From 58c3ab64eed21cbe445ecf797f6a65ae963b94bd Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Tue, 10 Jan 2023 14:17:01 +0800 Subject: [PATCH 01/80] Use AVX512 to optimize bit-packing decode functions. This will improve ORC bit packing performance. Only contains 1~32bit opt. --- CMakeLists.txt | 50 +- c++/src/DetectPlatform.hh | 88 + c++/src/RLEv2.hh | 378 ++- c++/src/RleDecoderV2.cc | 5370 ++++++++++++++++++++++++++---- c++/src/VectorDecoder.hh | 506 +++ c++/test/CMakeLists.txt | 5 +- c++/test/TestRleVectorDecoder.cc | 639 ++++ 7 files changed, 6209 insertions(+), 827 deletions(-) create mode 100644 c++/src/DetectPlatform.hh create mode 100644 c++/src/VectorDecoder.hh create mode 100644 c++/test/TestRleVectorDecoder.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d140d4285..d7f55f11fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,10 @@ option(BUILD_CPP_ENABLE_METRICS "Enable the metrics collection at compile phase" OFF) +option(ENABLE_AVX512_BIT_PACKING + "Enable AVX512 vector decode of bit-packing" + OFF) + # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") @@ -90,14 +94,6 @@ endif () # # Compiler specific flags # -# This ensures that things like c++17 get passed correctly -if(NOT DEFINED CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 17) -elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 17) - message(FATAL_ERROR "Cannot set a CMAKE_CXX_STANDARD smaller than 17") -endif() -# We require a C++17 compliant compiler -set(CMAKE_CXX_STANDARD_REQUIRED ON) if (NOT MSVC) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG -fno-omit-frame-pointer") @@ -105,18 +101,13 @@ if (NOT MSVC) endif () message(STATUS "compiler ${CMAKE_CXX_COMPILER_ID} version ${CMAKE_CXX_COMPILER_VERSION}") if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR - CMAKE_CXX_COMPILER_VERSION VERSION_LESS 
"5.0") - message(FATAL_ERROR "A c++17-compliant compiler is required, please use at least Clang 5") - else () - set (CXX17_FLAGS "-std=c++17") - endif () + set (CXX11_FLAGS "-std=c++11") set (WARN_FLAGS "-Weverything -Wno-c++98-compat -Wno-missing-prototypes") set (WARN_FLAGS "${WARN_FLAGS} -Wno-c++98-compat-pedantic -Wno-padded") set (WARN_FLAGS "${WARN_FLAGS} -Wno-covered-switch-default") set (WARN_FLAGS "${WARN_FLAGS} -Wno-missing-noreturn -Wno-unknown-pragmas") set (WARN_FLAGS "${WARN_FLAGS} -Wno-gnu-zero-variadic-macro-arguments") - set (WARN_FLAGS "${WARN_FLAGS} -Wconversion") + set (WARN_FLAGS "${WARN_FLAGS} -Wno-conversion") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") set (WARN_FLAGS "${WARN_FLAGS} -Wno-reserved-identifier -Wno-suggest-destructor-override -Wno-suggest-override") endif() @@ -129,27 +120,22 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set (WARN_FLAGS "${WARN_FLAGS} -Werror") endif () elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR - CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") - message(FATAL_ERROR "A c++17-compliant compiler is required, please use at least GCC 5") - else () - set (CXX17_FLAGS "-std=c++17") - endif () - set (WARN_FLAGS "-Wall -Wno-unknown-pragmas -Wconversion") + set (WARN_FLAGS "-Wall -Wno-unknown-pragmas -Wno-conversion") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "12.0") set (WARN_FLAGS "${WARN_FLAGS} -Wno-array-bounds -Wno-stringop-overread") # To compile protobuf in Fedora37 + elseif (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") + set (WARN_FLAGS "${WARN_FLAGS} -Wno-unused-function") endif () if (STOP_BUILD_ON_WARNING) set (WARN_FLAGS "${WARN_FLAGS} -Werror") endif () -elseif (MSVC) - include(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("/std:c++17" CPP17_FLAG_SUPPORTED) - if (CPP17_FLAG_SUPPORTED) - add_compile_options("/std:c++17") + if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR + CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.7") + set 
(CXX11_FLAGS "-std=c++0x") + else () + set (CXX11_FLAGS "-std=c++11") + endif () +elseif (MSVC) add_definitions (-D_SCL_SECURE_NO_WARNINGS) add_definitions (-D_CRT_SECURE_NO_WARNINGS) add_definitions (-D_CRT_NONSTDC_NO_DEPRECATE) # The POSIX name for this item is deprecated @@ -165,6 +151,14 @@ else () add_compile_definitions(ENABLE_METRICS=0) endif () +if (ENABLE_AVX512_BIT_PACKING) + message(STATUS "Enable the AVX512 vector decode of bit-packing") + add_compile_definitions(ENABLE_AVX512=1) +else () + message(STATUS "Disable the AVX512 vector decode of bit-packing") + add_compile_definitions(ENABLE_AVX512=0) +endif () + enable_testing() INCLUDE(CheckSourceCompiles) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh new file mode 100644 index 0000000000..4281a8c8a7 --- /dev/null +++ b/c++/src/DetectPlatform.hh @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_DETECTPLATFORM_HH +#define ORC_DETECTPLATFORM_HH + +#ifdef _WIN32 + +#include "intrin.h" +// Windows CPUID +#define cpuid(info, x) __cpuidex(info, x, 0) +#else +// GCC Intrinsics +#include +#include + +// Run CPUID for leaf InfoType (sub-leaf 0); info receives EAX, EBX, ECX, EDX in that order. +// Declared inline because this header defines the function and may be included by several sources. +inline void cpuid(int info[4], int InfoType) { + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} + +// Execute XGETBV for extended control register 'index' and return EDX:EAX as one 64-bit value. +inline unsigned long long _xgetbv(unsigned int index) { + unsigned int eax, edx; + __asm__ __volatile__( + "xgetbv;" + : "=a" (eax), "=d"(edx) + : "c" (index) + ); + return ((unsigned long long) edx << 32) | eax; +} + +#endif + +namespace orc +{ + // AVX512 feature flags reported in EBX by CPUID leaf 7, sub-leaf 0 + #define CPUID_AVX512F 0x00010000u // bit 16 + #define CPUID_AVX512CD 0x10000000u // bit 28 + #define CPUID_AVX512VL 0x80000000u // bit 31 + #define CPUID_AVX512BW 0x40000000u // bit 30 + #define CPUID_AVX512DQ 0x00020000u // bit 17 + // OSXSAVE flag reported in ECX by CPUID leaf 1 + #define EXC_OSXSAVE 0x08000000u // bit 27 + + #define CPUID_AVX512_MASK (CPUID_AVX512F | CPUID_AVX512CD | CPUID_AVX512VL | CPUID_AVX512BW | CPUID_AVX512DQ) + + enum arch_t { + px_arch = 0, + avx2_arch = 1, + avx512_arch = 2 + }; + + // Returns avx512_arch when the CPU reports every subset in CPUID_AVX512_MASK and the OS saves + // the ZMM register state; returns px_arch otherwise. The probe runs once and is then cached. + inline arch_t detect_platform() { + static const arch_t detected_platform = []() -> arch_t { + int cpu_info[4]; + + // Leaf 0: EAX holds the highest basic leaf; the AVX512 flags need leaf 7 to exist. + cpuid(cpu_info, 0); + if (cpu_info[0] < 7) { + return arch_t::px_arch; + } + + // Leaf 1: ECX bit 27 says the OS enabled XSAVE, which makes XGETBV usable. + cpuid(cpu_info, 1); + bool os_uses_XSAVE_XSTORE = (static_cast<unsigned int>(cpu_info[2]) & EXC_OSXSAVE) != 0; + + // Leaf 7: require all subsets in the mask, not just any one of them. + cpuid(cpu_info, 7); + bool avx512_support_cpu = (static_cast<unsigned int>(cpu_info[1]) & CPUID_AVX512_MASK) == CPUID_AVX512_MASK; + + if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { + unsigned long long xcr_feature_mask = _xgetbv(0); + + // XCR0 bits 1 and 2: XMM and YMM state are saved by the OS + if ((xcr_feature_mask & 0x6) == 0x6) { + // XCR0 bits 5 to 7: opmask and ZMM state are saved by the OS + if ((xcr_feature_mask & 0xe0) == 0xe0) { + return arch_t::avx512_arch; + } + } + } + + return arch_t::px_arch; + }(); + + return detected_platform; + } +} + +#endif diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index f48ce8391b..b766d2ae60 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -1,105 +1,78 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ #ifndef ORC_RLEV2_HH #define ORC_RLEV2_HH #include "Adaptor.hh" -#include "RLE.hh" #include "orc/Exceptions.hh" +#include "RLE.hh" #include +#define MAX_VECTOR_BUF_8BIT_LENGTH 64 +#define MAX_VECTOR_BUF_16BIT_LENGTH 32 +#define MAX_VECTOR_BUF_32BIT_LENGTH 16 #define MAX_LITERAL_SIZE 512 #define MIN_REPEAT 3 #define HIST_LEN 32 namespace orc { - struct FixedBitSizes { +struct FixedBitSizes { enum FBS { - ONE = 0, - TWO, - THREE, - FOUR, - FIVE, - SIX, - SEVEN, - EIGHT, - NINE, - TEN, - ELEVEN, - TWELVE, - THIRTEEN, - FOURTEEN, - FIFTEEN, - SIXTEEN, - SEVENTEEN, - EIGHTEEN, - NINETEEN, - TWENTY, - TWENTYONE, - TWENTYTWO, - TWENTYTHREE, - TWENTYFOUR, - TWENTYSIX, - TWENTYEIGHT, - THIRTY, - THIRTYTWO, - FORTY, - FORTYEIGHT, - FIFTYSIX, - SIXTYFOUR, - SIZE + ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, + TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE }; - }; - - enum EncodingType { SHORT_REPEAT = 0, DIRECT = 1, PATCHED_BASE = 2, DELTA = 3 }; - - struct EncodingOption { - EncodingType encoding; - int64_t fixedDelta; - int64_t gapVsPatchListCount; - int64_t zigzagLiteralsCount; - int64_t baseRedLiteralsCount; - int64_t adjDeltasCount; - uint32_t zzBits90p; - uint32_t zzBits100p; - uint32_t brBits95p; - uint32_t brBits100p; - uint32_t bitsDeltaMax; - uint32_t patchWidth; - uint32_t patchGapWidth; - uint32_t patchLength; - int64_t min; - bool isFixedDelta; - }; - - class RleEncoderV2 : public RleEncoder { - public: - RleEncoderV2(std::unique_ptr outStream, bool hasSigned, - bool alignBitPacking = true); +}; + +enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; + +struct EncodingOption { + EncodingType encoding; + int64_t fixedDelta; + int64_t gapVsPatchListCount; + int64_t zigzagLiteralsCount; + int64_t baseRedLiteralsCount; + int64_t 
adjDeltasCount; + uint32_t zzBits90p; + uint32_t zzBits100p; + uint32_t brBits95p; + uint32_t brBits100p; + uint32_t bitsDeltaMax; + uint32_t patchWidth; + uint32_t patchGapWidth; + uint32_t patchLength; + int64_t min; + bool isFixedDelta; +}; + +class RleEncoderV2 : public RleEncoder { +public: + RleEncoderV2(std::unique_ptr outStream, bool hasSigned, bool alignBitPacking = true); ~RleEncoderV2() override { - delete[] literals; - delete[] gapVsPatchList; - delete[] zigzagLiterals; - delete[] baseRedLiterals; - delete[] adjDeltas; + delete [] literals; + delete [] gapVsPatchList; + delete [] zigzagLiterals; + delete [] baseRedLiterals; + delete [] adjDeltas; } /** * Flushing underlying BufferedOutputStream @@ -108,19 +81,20 @@ namespace orc { void write(int64_t val) override; - private: +private: + const bool alignedBitPacking; uint32_t fixedRunLength; uint32_t variableRunLength; int64_t prevDelta; int32_t histgram[HIST_LEN]; - // The four list below should actually belong to EncodingOption since it only holds temporal - // values in write(int64_t val), it is move here for performance consideration. + // The four lists below should actually belong to EncodingOption since they only hold temporary values in write(int64_t val); + // they were moved here for performance reasons. 
int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; + int64_t* zigzagLiterals; + int64_t* baseRedLiterals; + int64_t* adjDeltas; uint32_t getOpCode(EncodingType encoding); int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); @@ -135,102 +109,136 @@ namespace orc { void writeDirectValues(EncodingOption& option); void writePatchedBasedValues(EncodingOption& option); void writeDeltaValues(EncodingOption& option); - uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, - bool reuseHist = false); - }; - - class RleDecoderV2 : public RleDecoder { - public: - RleDecoderV2(std::unique_ptr input, bool isSigned, MemoryPool& pool, - ReaderMetrics* metrics); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - template - void next(T* data, uint64_t numValues, const char* notNull); - - void next(int64_t* data, uint64_t numValues, const char* notNull) override; - - void next(int32_t* data, uint64_t numValues, const char* notNull) override; - - void next(int16_t* data, uint64_t numValues, const char* notNull) override; - - private: - /** - * Decode the next gap and patch from 'unpackedPatch' and update the index on it. - * Used by PATCHED_BASE. 
- * - * @param patchBitSize bit size of the patch value - * @param patchMask mask for the patch value - * @param resGap result of gap - * @param resPatch result of patch - * @param patchIdx current index in the 'unpackedPatch' buffer - */ - void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, - int64_t* resPatch, uint64_t* patchIdx); - - void resetReadLongs() { - bitsLeft = 0; - curByte = 0; - } - - void resetRun() { - resetReadLongs(); - } - - unsigned char readByte(); - - int64_t readLongBE(uint64_t bsz); - int64_t readVslong(); - uint64_t readVulong(); - void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - - void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); - - template - uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - template - uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - template - uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - template - uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - template - uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull); - - const std::unique_ptr inputStream; - const bool isSigned; - - unsigned char 
firstByte; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run - const char* bufferStart; - const char* bufferEnd; - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs - DataBuffer unpackedPatch; // Used by PATCHED_BASE - DataBuffer literals; // Values of the current run - }; + uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); +}; + +class RleDecoderV2 : public RleDecoder { +public: + RleDecoderV2(std::unique_ptr input, + bool isSigned, MemoryPool& pool, + ReaderMetrics* metrics); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. + */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + void next(int64_t* data, uint64_t numValues, + const char* notNull) override; + +private: + + /** + * Decode the next gap and patch from 'unpackedPatch' and update the index on it. + * Used by PATCHED_BASE. 
+ * + * @param patchBitSize bit size of the patch value + * @param patchMask mask for the patch value + * @param resGap result of gap + * @param resPatch result of patch + * @param patchIdx current index in the 'unpackedPatch' buffer + */ + void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, + int64_t* resGap, int64_t* resPatch, uint64_t* patchIdx); + + void resetReadLongs() { + bitsLeft = 0; + curByte = 0; + } + + void resetRun() { + resetReadLongs(); + } + + void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); + + unsigned char readByte(); + + int64_t readLongBE(uint64_t bsz); + int64_t readVslong(); + uint64_t readVulong(); + void readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs); + void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit); + +#if ENABLE_AVX512 + void unrolledUnpackVector1(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector2(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector3(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector4(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector5(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector16(int64_t *data, uint64_t 
offset, uint64_t len); + void unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len); +#endif + + void unrolledUnpack4(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack8(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack16(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack24(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack32(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack40(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len); + + uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + + 
uint64_t copyDataFromBuffer(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + + const std::unique_ptr inputStream; + const bool isSigned; + + unsigned char firstByte; + uint64_t runLength; // Length of the current run + uint64_t runRead; // Number of returned values of the current run + const char *bufferStart; + const char *bufferEnd; + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs + DataBuffer unpackedPatch; // Used by PATCHED_BASE + DataBuffer literals; // Values of the current run +#if ENABLE_AVX512 + uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data + uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; // Used by vectorially 9~16 bit-unpacking data + uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; // Used by vectorially 17~32 bit-unpacking data +#endif +}; } // namespace orc #endif // ORC_RLEV2_HH diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 2742aef6f6..6cfc7bf2e0 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -18,55 +18,185 @@ #include "Adaptor.hh" #include "Compression.hh" -#include "RLEV2Util.hh" #include "RLEv2.hh" +#include "RLEV2Util.hh" +#include "VectorDecoder.hh" +#include "DetectPlatform.hh" #include "Utils.hh" namespace orc { +void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { + uint64_t restLen = bufferEnd - bufferStart; + int bufferLength = 0; + const void* bufferPointer = nullptr; - unsigned char RleDecoderV2::readByte() { - SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::readByte"); - } - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + if (backupByteLen != 0) { + 
inputStream->BackUp(backupByteLen); + } + + if (len >= restLen && resetBuf == true) { + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::resetBufferStart"); } + } - unsigned char result = static_cast(*bufferStart++); - return result; + if (bufferPointer == nullptr) { + bufferStart += len; + } else { + bufferStart = static_cast(bufferPointer); + bufferEnd = bufferStart + bufferLength; } +} - int64_t RleDecoderV2::readLongBE(uint64_t bsz) { - int64_t ret = 0, val; - uint64_t n = bsz; - while (n > 0) { - n--; - val = readByte(); - ret |= (val << (n * 8)); +unsigned char RleDecoderV2::readByte() { + SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::readByte"); } - return ret; + bufferStart = static_cast(bufferPointer); + bufferEnd = bufferStart + bufferLength; } - inline int64_t RleDecoderV2::readVslong() { - return unZigZag(readVulong()); - } + unsigned char result = static_cast(*bufferStart++); + return result; +} - uint64_t RleDecoderV2::readVulong() { - uint64_t ret = 0, b; - uint64_t offset = 0; - do { - b = readByte(); - ret |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return ret; +int64_t RleDecoderV2::readLongBE(uint64_t bsz) { + int64_t ret = 0, val; + uint64_t n = bsz; + while (n > 0) { + n--; + val = readByte(); + ret |= (val << (n * 8)); } + return ret; +} + +inline int64_t RleDecoderV2::readVslong() { + return unZigZag(readVulong()); +} - void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { +uint64_t RleDecoderV2::readVulong() { + uint64_t ret = 0, b; + uint64_t offset = 0; + do { + b = readByte(); + ret |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return ret; +} + +void RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len, 
uint64_t fbs) { + uint64_t startBit = 0; +#if ENABLE_AVX512 + if (detect_platform() == arch_t::avx512_arch) { + switch (fbs) { + case 1: + unrolledUnpackVector1(data, offset, len); + return; + case 2: + unrolledUnpackVector2(data, offset, len); + return; + case 3: + unrolledUnpackVector3(data, offset, len); + return; + case 4: + unrolledUnpackVector4(data, offset, len); + return; + case 5: + unrolledUnpackVector5(data, offset, len); + return; + case 6: + unrolledUnpackVector6(data, offset, len); + return; + case 7: + unrolledUnpackVector7(data, offset, len); + return; + case 8: + unrolledUnpack8(data, offset, len); + return; + case 9: + unrolledUnpackVector9(data, offset, len); + return; + case 10: + unrolledUnpackVector10(data, offset, len); + return; + case 11: + unrolledUnpackVector11(data, offset, len); + return; + case 12: + unrolledUnpackVector12(data, offset, len); + return; + case 13: + unrolledUnpackVector13(data, offset, len); + return; + case 14: + unrolledUnpackVector14(data, offset, len); + return; + case 15: + unrolledUnpackVector15(data, offset, len); + return; + case 16: + unrolledUnpackVector16(data, offset, len); + return; + case 17: + unrolledUnpackVector17(data, offset, len); + return; + case 18: + unrolledUnpackVector18(data, offset, len); + return; + case 19: + unrolledUnpackVector19(data, offset, len); + return; + case 20: + unrolledUnpackVector20(data, offset, len); + return; + case 21: + unrolledUnpackVector21(data, offset, len); + return; + case 22: + unrolledUnpackVector22(data, offset, len); + return; + case 23: + unrolledUnpackVector23(data, offset, len); + return; + case 24: + unrolledUnpackVector24(data, offset, len); + return; + case 26: + unrolledUnpackVector26(data, offset, len); + return; + case 28: + unrolledUnpackVector28(data, offset, len); + return; + case 30: + unrolledUnpackVector30(data, offset, len); + return; + case 32: + unrolledUnpackVector32(data, offset, len); + return; + case 40: + unrolledUnpack40(data, offset, 
len); + return; + case 48: + unrolledUnpack48(data, offset, len); + return; + case 56: + unrolledUnpack56(data, offset, len); + return; + case 64: + unrolledUnpack64(data, offset, len); + return; + default: + // Fallback to the default implementation for deprecated bit size. + plainUnpackLongs(data, offset, len, fbs, startBit); + return; + } + } else { switch (fbs) { case 4: unrolledUnpack4(data, offset, len); @@ -97,668 +227,4686 @@ namespace orc { return; default: // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs); + plainUnpackLongs(data, offset, len, fbs, startBit); return; } } +#else + switch (fbs) { + case 4: + unrolledUnpack4(data, offset, len); + return; + case 8: + unrolledUnpack8(data, offset, len); + return; + case 16: + unrolledUnpack16(data, offset, len); + return; + case 24: + unrolledUnpack24(data, offset, len); + return; + case 32: + unrolledUnpack32(data, offset, len); + return; + case 40: + unrolledUnpack40(data, offset, len); + return; + case 48: + unrolledUnpack48(data, offset, len); + return; + case 56: + unrolledUnpack56(data, offset, len); + return; + case 64: + unrolledUnpack64(data, offset, len); + return; + default: + // Fallback to the default implementation for deprecated bit size. 
+ plainUnpackLongs(data, offset, len, fbs, startBit); + return; + } +#endif +} + +#if ENABLE_AVX512 +void RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 1; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } - void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. - while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; } - if (curIdx == offset + len) return; + } - // Exhaust the buffer - uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - uint32_t localByte; - for (uint64_t i = 0; i < numGroups; ++i) { - localByte = *buffer++; - data[curIdx] = (localByte >> 4) & 15; - data[curIdx + 1] = localByte & 15; - curIdx += 2; + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } - // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = readByte(); - bitsLeft = 8; + if (numElements >= 64) { + __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); + while (numElements >= 64) { + uint64_t src_64 = *(uint64_t *)srcPtr; + // convert mask to 512-bit register. 0 --> 0x00, 1 --> 0xFF + __m512i srcmm = _mm512_movm_epi8(src_64); + // make 0x00 --> 0x00, 0xFF --> 0x01 + srcmm = _mm512_abs_epi8(srcmm); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); + _mm512_storeu_si512(vectorBuf8, srcmm); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } } - } - void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = bufferEnd - bufferStart; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - data[curIdx++] = *buffer++; + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } - // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = readByte(); + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); } +} + +void RleDecoderV2::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 2; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; - void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 2; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint16_t b0, b1; - // Avoid updating 'bufferStart' inside 
the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - buffer += 2; - data[curIdx++] = (b0 << 8) | b1; + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; + } - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - data[curIdx++] = (b0 << 8) | b1; + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; } - } - void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 3; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - buffer += 3; - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - bufferStart += bufferNum * 3; - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } } - } - void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 4; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2, b3; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - buffer += 4; - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) + __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 + while (numElements >= 64) { + __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + __m512i srcmm0, srcmm1, srcmm2, tmpmm; + + srcmm2 = _mm512_srli_epi16(srcmm3, 2); + srcmm1 = _mm512_srli_epi16(srcmm3, 4); + srcmm0 = _mm512_srli_epi16(srcmm3, 6); + + // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. + // move them into their places + // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn + + // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 + srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 + srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op + + tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl + srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop + + srcmm0 = _mm512_and_si512(srcmm0, parse_mask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } } - } - void 
RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 5; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - buffer += 5; - data[curIdx++] = - static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = 
reinterpret_cast(bufferStart); } +} - void RleDecoderV2::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 6; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - buffer += 6; - data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | - (b4 << 8) | b5); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - data[curIdx++] = - static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); - } - } - - void RleDecoderV2::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 7; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - buffer += 7; - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - } - - void RleDecoderV2::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 8; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - b7 = static_cast(*(buffer + 7)); - buffer += 8; - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - b7 = readByte(); - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - } - - void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { - for (uint64_t i = offset; i < (offset + len); i++) { - uint64_t result = 0; - uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast(result); - } - } - - RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, - MemoryPool& pool, ReaderMetrics* _metrics) - : RleDecoder(_metrics), - inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - runLength(0), - runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - bitsLeft(0), - curByte(0), - unpackedPatch(pool, 0), - literals(pool, MAX_LITERAL_SIZE) { - // PASS - } - - void RleDecoderV2::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; - // skip ahead the given number of records - skip(location.next()); - } - - void RleDecoderV2::skip(uint64_t numValues) { - // simple for now, until perf tests indicate something encoding specific is - // needed - const uint64_t N = 64; - int64_t dummy[N]; - - while (numValues) { - uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); - numValues -= nRead; - } - } - - template - void RleDecoderV2::next(T* const data, const uint64_t numValues, const char* 
const notNull) { - SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); - uint64_t nRead = 0; - - while (nRead < numValues) { - // Skip any nulls before attempting to read first byte. - while (notNull && !notNull[nRead]) { - if (++nRead == numValues) { - return; // ended with null values - } +void RleDecoderV2::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 3; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } - if (runRead == runLength) { - resetRun(); - firstByte = readByte(); + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + 
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - uint64_t offset = nRead, length = numValues - nRead; + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); - EncodingType enc = static_cast((firstByte >> 6) & 0x03); - switch (static_cast(enc)) { - case SHORT_REPEAT: - nRead += nextShortRepeats(data, offset, length, notNull); - break; - case DIRECT: - nRead += nextDirect(data, offset, length, notNull); - break; - case PATCHED_BASE: - nRead += nextPatched(data, offset, length, notNull); - break; - case DELTA: - nRead += nextDelta(data, offset, length, notNull); - break; - default: - throw ParseError("unknown encoding"); + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + 
_mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; } } - } - void RleDecoderV2::next(int64_t* data, uint64_t numValues, const char* notNull) { - next(data, numValues, notNull); - } + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } - void RleDecoderV2::next(int32_t* data, uint64_t numValues, const char* notNull) { - next(data, numValues, notNull); - } + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } - void RleDecoderV2::next(int16_t* data, uint64_t numValues, const char* notNull) { - next(data, numValues, notNull); - } + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } - template - uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bytes - uint64_t byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} - runLength = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; +void 
RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len){ + uint32_t bitWidth = 4; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; - // read the repeated value which is store using fixed bytes - literals[0] = readLongBE(byteSize); + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } - if (isSigned) { - literals[0] = unZigZag(static_cast(literals[0])); + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; } } - uint64_t nRead = std::min(runLength - runRead, numValues); + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } - if (notNull) { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = static_cast(literals[0]); - ++runRead; - } + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; } - } else { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = static_cast(literals[0]); - ++runRead; + if (align != 0) { + 
bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; } } - return nRead; - } + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) + __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 + while (numElements >= 64) { + __m512i srcmm0, srcmm1, tmpmm; - template - uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - readLongs(literals.data(), 0, runLength, bitSize); - if (isSigned) { - for (uint64_t i = 0; i < runLength; ++i) { - literals[i] = unZigZag(static_cast(literals[i])); - } + srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm0 = _mm512_srli_epi16(srcmm1, 4); + + // move elements into their places + // srcmm0: a c e g 0 0 0 0 + // srcmm1: b d f h 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh + + // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
+ srcmm0 = _mm512_and_si512(srcmm0, parseMask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; } } - return copyDataFromBuffer(data, offset, numValues, notNull); - } + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } - void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, - int64_t* resPatch, uint64_t* patchIdx) { - uint64_t idx = *patchIdx; - uint64_t gap = static_cast(unpackedPatch[idx]) >> patchBitSize; - int64_t patch = unpackedPatch[idx] & patchMask; - int64_t actualGap = 0; + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (gap == 255 && patch == 0) { - actualGap += 255; - ++idx; - gap = static_cast(unpackedPatch[idx]) >> patchBitSize; - patch = unpackedPatch[idx] & patchMask; + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); } - // add the left over gap - actualGap += gap; - *resGap = actualGap; - *resPatch = patch; - *patchIdx = idx; + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); } +} - template - uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); - uint64_t byteSize = (thirdByte >> 5) & 0x07; - // base width is one off - byteSize += 1; - - // extract patch width - uint32_t pwo = thirdByte & 0x1f; - uint32_t patchBitSize = decodeBitWidth(pwo); - - // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); - uint32_t pgw = (fourthByte >> 5) & 0x07; - // patch gap width is one off - pgw += 1; - - // extract the length of the patch list - size_t pl = fourthByte & 0x1f; - if (pl == 0) { - throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); - } - - // read the next base width number of bytes to extract base value - int64_t base = readLongBE(byteSize); - int64_t mask = (static_cast(1) << ((byteSize * 8) - 1)); - // if mask of base value is 1 then base is negative 
value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - readLongs(literals.data(), 0, runLength, bitSize); - // any remaining bits are thrown out - resetReadLongs(); - - // TODO: something more efficient than resize - unpackedPatch.resize(pl); - // TODO: Skip corrupt? - // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - if ((patchBitSize + pgw) > 64) { - throw ParseError( - "Corrupt PATCHED_BASE encoded data " - "(patchBitSize + pgw > 64)!"); - } - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); - // any remaining bits are thrown out - resetReadLongs(); - - // apply the patch directly when decoding the packed data - int64_t patchMask = ((static_cast(1) << patchBitSize) - 1); - - int64_t gap = 0; - int64_t patch = 0; - uint64_t patchIdx = 0; - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); +void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len){ + uint32_t bitWidth = 5; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; - for (uint64_t i = 0; i < runLength; ++i) { - if (static_cast(i) != gap) { - // no patching required. 
add base to unpacked value to get final value - literals[i] += base; - } else { - // extract the patch value - int64_t patchedVal = literals[i] | (patch << bitSize); + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } - // add base to patched value - literals[i] = base + patchedVal; + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } - // increment the patch to point to next entry in patch list - ++patchIdx; + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - if (patchIdx < unpackedPatch.size()) { - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); + __m512i permutexIdx = 
_mm512_load_si512(permutexIdxTable5u); - // next gap is relative to the current gap - gap += i; - } - } + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; } } - return copyDataFromBuffer(data, offset, numValues, notNull); + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + 
return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); } +} - template - uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize; - if (fbo != 0) { - bitSize = decodeBitWidth(fbo); +void RleDecoderV2::unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 6; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; } else { - bitSize = 0; + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; } + } - // extract the run length - 
runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = 0; + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } - int64_t prevValue; - // read the first value stored as vint - if (isSigned) { - prevValue = readVslong(); - } else { - prevValue = static_cast(readVulong()); + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; } + } - literals[0] = prevValue; + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - int64_t deltaBase = readVslong(); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); - if (bitSize == 0) { - // add fixed deltas to adjacent values - for (uint64_t i = 1; i < runLength; ++i) { - literals[i] = literals[i - 1] + deltaBase; - } + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = 
_mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { - prevValue = literals[1] = prevValue + deltaBase; - if (runLength < 2) { - std::stringstream ss; - ss << "Illegal run length for delta encoding: " << runLength; - throw ParseError(ss.str()); - } - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence. - // read deltas using the literals buffer. 
- readLongs(literals.data(), 2, runLength - 2, bitSize); - if (deltaBase < 0) { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue - literals[i]; - } - } else { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue + literals[i]; - } - } + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); } - return copyDataFromBuffer(data, offset, numValues, notNull); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); } +} - template - uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, - const char* notNull) { - uint64_t nRead = std::min(runLength - runRead, numValues); - if (notNull) { - for (uint64_t i = offset; i < (offset + nRead); ++i) { - if (notNull[i]) { - data[i] = static_cast(literals[runRead++]); - } - } +void RleDecoderV2::unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 7; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH , 
ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; } else { - for (uint64_t i = offset; i < (offset + nRead); ++i) { - data[i] = static_cast(literals[runRead++]); + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable7u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, 
srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 9; + const uint8_t *srcPtr = 
reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm 
= _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], 
shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 7); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void 
RleDecoderV2::unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 10; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i 
shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_load_si512(shiftTable10u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 11; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + 
offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = 
_mm512_load_si512(shuffleIdxTable11u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + 
+ srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 5); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 12; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, 
bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_load_si512(shiftTable12u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + 
} + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +}
+
+// Decodes `len` values stored at 13 bits each from the decoder's current input
+// buffer (bufferStart .. bufferEnd) and writes them to data[offset], data[offset + 1], ...
+//
+// Each pass of the outer loop decodes as many values as the bytes currently
+// between bufferStart and bufferEnd can supply: an optional scalar head through
+// plainUnpackLongs() while startBit is non-zero, AVX-512 batches of 32 values,
+// and a scalar tail. resetBufferStart() is then called before the next pass
+// (presumably it advances and/or refills the buffer - confirm against its definition).
+//
+// NOTE(review): startBit is never assigned in this function, so it can only
+// become non-zero if plainUnpackLongs() takes it by reference - confirm.
+void RleDecoderV2::unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len) {
+  uint32_t bitWidth = 13;
+  // 32 values * 13 bits = 416 bits = 52 bytes consumed per vector batch.
+  const uint32_t batchBytes = 4 * bitWidth;
+  const uint8_t *srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+  uint64_t numElements = 0;
+  int64_t* dstPtr = data + offset;
+  uint64_t bufMoveByteLen = 0;
+  uint64_t bufRestByteLen = bufferEnd - bufferStart;
+  bool resetBuf = false;
+  uint64_t startBit = 0;
+  uint64_t tailBitLen = 0;
+  uint32_t backupByteLen = 0;
+
+  while (len > 0) {
+    // Bytes that still have to be consumed to decode the `len` remaining values.
+    if (startBit != 0) {
+      bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+    } else {
+      bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+    }
+
+    if (bufMoveByteLen <= bufRestByteLen) {
+      // The current buffer holds everything that is left.
+      numElements = len;
+      resetBuf = false;
+      len -= numElements;
+    } else {
+      // Only part of the request fits: take the whole values available and
+      // remember how many bits of a split value are left over.
+      uint64_t availBits = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH;
+      if (startBit != 0) {
+        availBits += ORC_VECTOR_BYTE_WIDTH - startBit;
+      }
+      numElements = availBits / bitWidth;
+      len -= numElements;
+      // Integer remainder; the operands are integers, so no floating-point fmod().
+      tailBitLen = availBits % bitWidth;
+      resetBuf = true;
+    }
+
+    if (tailBitLen != 0) {
+      backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+      tailBitLen = 0;
+    }
+
+    // Scalar head: decode up to `align` values while the bit cursor is mid-byte.
+    if (startBit > 0) {
+      uint32_t align = getAlign(startBit, bitWidth, 16);
+      if (align > numElements) {
+        align = numElements;
+      }
+      if (align != 0) {
+        bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+        plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+        bufRestByteLen = bufferEnd - bufferStart;
+        dstPtr += align;
+        numElements -= align;
+      }
+    }
+
+    if (numElements >= 32) {
+      __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
+      __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
+      __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable);
+      __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u);
+      __m512i maskmm = _mm512_set1_epi8(0x0F);
+
+      __m512i shuffleIdxPtr[2];
+      shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0);
+      shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1);
+
+      __m512i permutexIdxPtr[2];
+      permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0);
+      permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1);
+
+      __m512i shiftMaskPtr[4];
+      shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0);
+      shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1);
+      shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2);
+      shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3);
+
+      __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u);
+
+      // Looks up both nibbles of every byte in nibbleReverseTable and swaps
+      // them. If the table maps a nibble to its bit-reversed value (as its
+      // name suggests - confirm), this reverses the bit order of each byte.
+      auto lookupAndSwapNibbles = [&](__m512i v) {
+        __m512i lowNibblemm = _mm512_and_si512(v, maskmm);
+        __m512i highNibblemm = _mm512_srli_epi16(v, 4);
+        highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+        lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+        highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+        lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+        return _mm512_or_si512(lowNibblemm, highNibblemm);
+      };
+
+      // Stores 32 decoded 16-bit lanes, advances the input by one batch and
+      // widens the lanes into the int64_t destination.
+      auto emitBatch = [&](__m512i decoded) {
+        _mm512_storeu_si512(vectorBuf16, decoded);
+
+        srcPtr += batchBytes;
+        resetBufferStart(batchBytes, false, 0);
+        bufRestByteLen = bufferEnd - bufferStart;
+        bufMoveByteLen -= batchBytes;
+        numElements -= 32;
+        std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr);
+        dstPtr += 32;
+      };
+
+      // Gather path. It is only entered with at least 64 values pending even
+      // though each pass consumes 32 - presumably so the 8-byte gathers stay
+      // inside data that is still to be decoded (confirm with gatherIdxTable13u).
+      while (numElements >= 64) {
+        __m512i srcmm, zmm[2];
+
+        srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
+
+        // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+        zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
+        zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);
+
+        // shifting elements so they start from the start of the word
+        zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
+        zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]);
+
+        // gathering even and odd elements together (mask 0xAAAAAAAA takes the odd 16-bit lanes from zmm[1])
+        zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
+        zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+        emitBatch(zmm[0]);
+      }
+
+      // Last full batch: masked load of exactly the words covered by readMask.
+      if (numElements >= 32) {
+        __m512i srcmm, zmm[2];
+
+        srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
+        srcmm = lookupAndSwapNibbles(srcmm);
+
+        // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+        zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
+        zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);
+
+        // shifting elements so they start from the start of the word
+        zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
+        zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);
+
+        // gathering even and odd elements together
+        zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
+        zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+        // Move the 13 payload bits to the top of each 16-bit lane (16 - 13 = 3)
+        // before the second nibble lookup and the final byte shuffle.
+        zmm[0] = _mm512_slli_epi16(zmm[0], 3);
+        zmm[0] = lookupAndSwapNibbles(zmm[0]);
+        zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u);
+
+        emitBatch(zmm[0]);
+      }
+    }
+
+    // Scalar tail: fewer than one vector batch left for this pass.
+    if (numElements > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+      plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+      srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+      dstPtr += numElements;
+      bufRestByteLen = bufferEnd - bufferStart;
+    }
+
+    // The outstanding byte advance fits in what is left of the buffer: apply it and finish.
+    if (bufMoveByteLen <= bufRestByteLen) {
+      resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen);
+      return;
+    }
+
+    // Otherwise consume the rest of the buffer; when bytes of a split value
+    // were recorded in backupByteLen, decode that single value separately.
+    resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);
+    if (backupByteLen != 0) {
+      plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+      dstPtr++;
+      backupByteLen = 0;
+      len--;
+    }
+
+    bufRestByteLen = bufferEnd - bufferStart;
+    bufMoveByteLen = 0;
+    srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+  }
+}
+
+void RleDecoderV2::unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 14; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { +
backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 
32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 15; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and 
in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 1); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm 
= _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +}
+
+// Decodes `len` values stored at 16 bits each from the decoder's current input
+// buffer (bufferStart .. bufferEnd) and writes them to data[offset], data[offset + 1], ...
+//
+// Values are byte-aligned at this width, so there is no bit cursor: each pass
+// of the outer loop decodes AVX-512 batches of 32 values (64 input bytes) and
+// hands any remainder to unrolledUnpack16(). resetBufferStart() is called
+// before the next pass (presumably it advances and/or refills the buffer -
+// confirm against its definition).
+void RleDecoderV2::unrolledUnpackVector16(int64_t *data, uint64_t offset, uint64_t len) {
+  uint32_t bitWidth = 16;
+  // 32 values * 16 bits = 64 bytes consumed per vector batch.
+  const uint32_t batchBytes = 4 * bitWidth;
+  const uint8_t *srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+  uint64_t numElements = len;
+  uint64_t bufMoveByteLen = 0;
+  uint64_t bufRestByteLen = bufferEnd - bufferStart;
+  int64_t* dstPtr = data + offset;
+  bool resetBuf = false;
+  uint64_t tailBitLen = 0;
+  uint32_t backupByteLen = 0;
+
+  while (len > 0) {
+    // Bytes that still have to be consumed to decode the `len` remaining values.
+    bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+
+    if (bufMoveByteLen <= bufRestByteLen) {
+      // The current buffer holds everything that is left.
+      numElements = len;
+      resetBuf = false;
+      // Kept in step with the other unrolledUnpackVectorNN functions. This
+      // path returns below once the batch and tail code have brought
+      // bufMoveByteLen back to zero, so the loop condition is not re-read.
+      len -= numElements;
+    } else {
+      // Only part of the request fits: take the whole values available and
+      // remember how many bits of a split value are left over.
+      const uint64_t availBits = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH;
+      numElements = availBits / bitWidth;
+      len -= numElements;
+      // Integer remainder; the operands are integers, so no floating-point fmod().
+      tailBitLen = availBits % bitWidth;
+      resetBuf = true;
+    }
+
+    if (tailBitLen != 0) {
+      backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+      tailBitLen = 0;
+    }
+
+    if (numElements >= 32) {
+      // Byte shuffle pattern loaded from reverseMaskTable16u; presumably it
+      // swaps the two bytes of every 16-bit lane (big-endian input to host
+      // order) - confirm against the table definition.
+      __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u);
+      while (numElements >= 32) {
+        __m512i srcmm = _mm512_loadu_si512(srcPtr);
+        srcmm = _mm512_shuffle_epi8(srcmm, reverseMask16u);
+        _mm512_storeu_si512(vectorBuf16, srcmm);
+
+        srcPtr += batchBytes;
+        resetBufferStart(batchBytes, false, 0);
+        bufRestByteLen = bufferEnd - bufferStart;
+        bufMoveByteLen -= batchBytes;
+        numElements -= 32;
+        // Widen the 32 staged 16-bit lanes into the int64_t destination.
+        std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr);
+        dstPtr += 32;
+      }
+    }
+
+    // Scalar tail: fewer than one vector batch left for this pass.
+    if (numElements > 0) {
+      bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      unrolledUnpack16(dstPtr, 0, numElements);
+      srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+      dstPtr += numElements;
+      bufRestByteLen = bufferEnd - bufferStart;
+    }
+
+    // The outstanding byte advance fits in what is left of the buffer: apply it and finish.
+    if (bufMoveByteLen <= bufRestByteLen) {
+      resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen);
+      return;
+    }
+
+    // Otherwise consume the rest of the buffer; when a byte of a split value
+    // was recorded in backupByteLen, decode that single value separately.
+    resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);
+    if (backupByteLen != 0) {
+      unrolledUnpack16(dstPtr, 0, 1);
+      dstPtr++;
+      backupByteLen = 0;
+      len--;
+    }
+
+    bufRestByteLen = bufferEnd - bufferStart;
+    bufMoveByteLen = 0;
+    srcPtr = reinterpret_cast<const uint8_t*>(bufferStart);
+  }
+}
+
+void RleDecoderV2::unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 17; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t
bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = 
_mm512_load_si512(permutexIdxTable17u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = 
_mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 15); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 18; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + 
uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + 
__m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = 
_mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 14); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector19(int64_t *data, uint64_t offset, 
uint64_t len) { + uint32_t bitWidth = 19; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = 
_mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the 
start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 13); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); 
+ } +} + +void RleDecoderV2::unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 20; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0u) { + uint32_t align = getAlign(startBit, bitWidth, 32u); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16u) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = 
_mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); + __m512i shiftMask = _mm512_load_si512(shiftTable20u); + + while (numElements >= 16u) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 21; + const uint8_t *srcPtr = 
reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0u) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + 
+ __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = 
_mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 11); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector22(int64_t *data, uint64_t offset, 
uint64_t len) { + uint32_t bitWidth = 22; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i 
reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable22u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they 
start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 10); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = 
reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 23; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = 
_mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = 
_mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 9); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 24; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + + __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); + + while (numElements >= 16) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unrolledUnpack24(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(bufferStart); + dstPtr 
+= numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + unrolledUnpack24(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 26; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > 
numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, 
maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 6); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, 
numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 28; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; 
+ } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_load_si512(shiftTable28u); + + while (numElements >= 16) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, 
resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 30; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + 
startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); 
+ bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 2u); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4u); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, 
vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} + +void RleDecoderV2::unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 32; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 16) { + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + while (numElements >= 16) { + __m512i srcmm 
= _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); + _mm512_storeu_si512(vectorBuf32, srcmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unrolledUnpack32(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + unrolledUnpack32(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } +} +#endif + +void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. + while (bitsLeft > 0 && curIdx < offset + len) { + bitsLeft -= 4; + data[curIdx++] = (curByte >> bitsLeft) & 15; + } + if (curIdx == offset + len) return; + + // Exhaust the buffer + uint64_t numGroups = (offset + len - curIdx) / 2; + numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); + // Avoid updating 'bufferStart' inside the loop. 
+ const auto *buffer = reinterpret_cast(bufferStart); + uint32_t localByte; + for (uint64_t i = 0; i < numGroups; ++i) { + localByte = *buffer++; + data[curIdx] = (localByte >> 4) & 15; + data[curIdx + 1] = localByte & 15; + curIdx += 2; + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd' + curByte = readByte(); + bitsLeft = 8; + } +} + +void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = bufferEnd - bufferStart; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + data[curIdx++] = *buffer++; + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd'. + data[curIdx++] = readByte(); + } +} + +void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 2; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint16_t b0, b1; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + buffer += 2; + data[curIdx++] = (b0 << 8) | b1; + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + data[curIdx++] = (b0 << 8) | b1; + } +} + +void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 3; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + buffer += 3; + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + bufferStart += bufferNum * 3; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } +} + +void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 4; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2, b3; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + buffer += 4; + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } +} + +void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 5; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + buffer += 5; + data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } +} + +void RleDecoderV2::unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 6; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5; + // Avoid updating 'bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + buffer += 6; + data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } +} + +void RleDecoderV2::unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 7; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + buffer += 7; + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + b6 = readByte(); + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); + } +} + +void RleDecoderV2::unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 8; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6, b7; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + b7 = static_cast(*(buffer + 7)); + buffer += 8; + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + b6 = readByte(); + b7 = readByte(); + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } +} + +void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, + uint64_t fbs, uint64_t& startBit) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = readByte(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast(bitsLeftToRead); + result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); + } +} + +RleDecoderV2::RleDecoderV2(std::unique_ptr input, + bool _isSigned, MemoryPool& pool, + ReaderMetrics* _metrics + ): RleDecoder(_metrics), + inputStream(std::move(input)), + isSigned(_isSigned), + firstByte(0), + runLength(0), + runRead(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + bitsLeft(0), + curByte(0), + unpackedPatch(pool, 0), + literals(pool, MAX_LITERAL_SIZE) { + // PASS +} + +void RleDecoderV2::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // clear state + bufferEnd = bufferStart = nullptr; + runRead = runLength = 0; + // skip ahead the given number of records + skip(location.next()); +} + +void RleDecoderV2::skip(uint64_t numValues) { + // simple for now, until perf tests indicate something encoding specific is + // needed + const uint64_t N = 64; + int64_t dummy[N]; + + while (numValues) { + uint64_t nRead = std::min(N, numValues); + next(dummy, nRead, nullptr); + numValues -= nRead; + } +} + +void 
RleDecoderV2::next(int64_t* const data, + const uint64_t numValues, + const char* const notNull) { + SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); + uint64_t nRead = 0; + + while (nRead < numValues) { + // Skip any nulls before attempting to read first byte. + while (notNull && !notNull[nRead]) { + if (++nRead == numValues) { + return; // ended with null values + } + } + + if (runRead == runLength) { + resetRun(); + firstByte = readByte(); + } + + uint64_t offset = nRead, length = numValues - nRead; + + EncodingType enc = static_cast + ((firstByte >> 6) & 0x03); + switch(static_cast(enc)) { + case SHORT_REPEAT: + nRead += nextShortRepeats(data, offset, length, notNull); + break; + case DIRECT: + nRead += nextDirect(data, offset, length, notNull); + break; + case PATCHED_BASE: + nRead += nextPatched(data, offset, length, notNull); + break; + case DELTA: + nRead += nextDelta(data, offset, length, notNull); + break; + default: + throw ParseError("unknown encoding"); + } + } +} + +uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bytes + uint64_t byteSize = (firstByte >> 3) & 0x07; + byteSize += 1; + + runLength = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + runLength += MIN_REPEAT; + runRead = 0; + + // read the repeated value which is store using fixed bytes + literals[0] = readLongBE(byteSize); + + if (isSigned) { + literals[0] = unZigZag(static_cast(literals[0])); + } + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + if (notNull) { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = literals[0]; + ++runRead; + } + } + } else { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = literals[0]; + ++runRead; + } + } + + return nRead; +} + +uint64_t RleDecoderV2::nextDirect(int64_t* 
const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + readLongs(literals.data(), 0, runLength, bitSize); + if (isSigned) { + for (uint64_t i = 0; i < runLength; ++i) { + literals[i] = unZigZag(static_cast(literals[i])); + } + } + } + + return copyDataFromBuffer(data, offset, numValues, notNull); +} + +void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, + int64_t* resGap, int64_t* resPatch, + uint64_t* patchIdx) { + uint64_t idx = *patchIdx; + uint64_t gap = static_cast(unpackedPatch[idx]) >> patchBitSize; + int64_t patch = unpackedPatch[idx] & patchMask; + int64_t actualGap = 0; + + // special case: gap is >255 then patch value will be 0. 
+ // if gap is <=255 then patch value cannot be 0 + while (gap == 255 && patch == 0) { + actualGap += 255; + ++idx; + gap = static_cast(unpackedPatch[idx]) >> patchBitSize; + patch = unpackedPatch[idx] & patchMask; + } + // add the left over gap + actualGap += gap; + + *resGap = actualGap; + *resPatch = patch; + *patchIdx = idx; +} + +uint64_t RleDecoderV2::nextPatched(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + // extract the number of bytes occupied by base + uint64_t thirdByte = readByte(); + uint64_t byteSize = (thirdByte >> 5) & 0x07; + // base width is one off + byteSize += 1; + + // extract patch width + uint32_t pwo = thirdByte & 0x1f; + uint32_t patchBitSize = decodeBitWidth(pwo); + + // read fourth byte and extract patch gap width + uint64_t fourthByte = readByte(); + uint32_t pgw = (fourthByte >> 5) & 0x07; + // patch gap width is one off + pgw += 1; + + // extract the length of the patch list + size_t pl = fourthByte & 0x1f; + if (pl == 0) { + throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); + } + + // read the next base width number of bytes to extract base value + int64_t base = readLongBE(byteSize); + int64_t mask = (static_cast(1) << ((byteSize * 8) - 1)); + // if mask of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } + + readLongs(literals.data(), 0, runLength, bitSize); + // any remaining bits are thrown out + resetReadLongs(); + + // TODO: something more efficient than resize + unpackedPatch.resize(pl); + // TODO: Skip corrupt? 
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { + if ((patchBitSize + pgw) > 64) { + throw ParseError("Corrupt PATCHED_BASE encoded data " + "(patchBitSize + pgw > 64)!"); + } + uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); + readLongs(unpackedPatch.data(), 0, pl, cfb); + // any remaining bits are thrown out + resetReadLongs(); + + // apply the patch directly when decoding the packed data + int64_t patchMask = ((static_cast(1) << patchBitSize) - 1); + + int64_t gap = 0; + int64_t patch = 0; + uint64_t patchIdx = 0; + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); + + for (uint64_t i = 0; i < runLength; ++i) { + if (static_cast(i) != gap) { + // no patching required. add base to unpacked value to get final value + literals[i] += base; + } else { + // extract the patch value + int64_t patchedVal = literals[i] | (patch << bitSize); + + // add base to patched value + literals[i] = base + patchedVal; + + // increment the patch to point to next entry in patch list + ++patchIdx; + + if (patchIdx < unpackedPatch.size()) { + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, + &patchIdx); + + // next gap is relative to the current gap + gap += i; + } + } + } + } + + return copyDataFromBuffer(data, offset, numValues, notNull); +} + +uint64_t RleDecoderV2::nextDelta(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize; + if (fbo != 0) { + bitSize = decodeBitWidth(fbo); + } else { + bitSize = 0; + } + + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + ++runLength; // account for first value + runRead = 0; + + int64_t prevValue; + // read the first value stored as vint + if (isSigned) { + prevValue = readVslong(); + } else { + prevValue = static_cast(readVulong()); + } + + literals[0] = prevValue; + + // read 
the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + int64_t deltaBase = readVslong(); + + if (bitSize == 0) { + // add fixed deltas to adjacent values + for (uint64_t i = 1; i < runLength; ++i) { + literals[i] = literals[i - 1] + deltaBase; + } + } else { + prevValue = literals[1] = prevValue + deltaBase; + if (runLength < 2) { + std::stringstream ss; + ss << "Illegal run length for delta encoding: " << runLength; + throw ParseError(ss.str()); + } + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence. + // read deltas using the literals buffer. + readLongs(literals.data(), 2, runLength - 2, bitSize); + if (deltaBase < 0) { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue - literals[i]; + } + } else { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue + literals[i]; + } + } + } + } + + return copyDataFromBuffer(data, offset, numValues, notNull); +} + +uint64_t RleDecoderV2::copyDataFromBuffer(int64_t* data, uint64_t offset, + uint64_t numValues, const char* notNull) { + uint64_t nRead = std::min(runLength - runRead, numValues); + if (notNull) { + for (uint64_t i = offset; i < (offset + nRead); ++i) { + if (notNull[i]) { + data[i] = literals[runRead++]; } } - return nRead; + } else { + memcpy(data + offset, literals.data() + runRead, nRead * sizeof(int64_t)); + runRead += nRead; } + return nRead; +} } // namespace orc diff --git a/c++/src/VectorDecoder.hh b/c++/src/VectorDecoder.hh new file mode 100644 index 0000000000..8100c9e698 --- /dev/null +++ b/c++/src/VectorDecoder.hh @@ -0,0 +1,506 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef VECTOR_DECODER_HH +#define VECTOR_DECODER_HH + +#include +#include + +namespace orc { +#if ENABLE_AVX512 +#define ORC_VECTOR_BITS_2_BYTE(x) (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */ +#define ORC_VECTOR_ONE_64U (1ULL) +#define ORC_VECTOR_MAX_16U 0xFFFF /**< Max value for uint16_t */ +#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */ +#define ORC_VECTOR_BYTE_WIDTH 8u /**< Byte width in bits */ +#define ORC_VECTOR_WORD_WIDTH 16u /**< Word width in bits */ +#define ORC_VECTOR_DWORD_WIDTH 32u /**< Dword width in bits */ +#define ORC_VECTOR_QWORD_WIDTH 64u /**< Qword width in bits */ +#define ORC_VECTOR_BIT_MASK(x) ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */ + +#define ORC_VECTOR_BITS_2_WORD(x) (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */ +#define ORC_VECTOR_BITS_2_DWORD(x) (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */ + +// ------------------------------------ 3u ----------------------------------------- +static uint8_t shuffleIdxTable3u_0[64] = { + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 
4u, 6u, 5u, + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u}; +static uint8_t shuffleIdxTable3u_1[64] = { + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; +static uint16_t shiftTable3u_0[32] = { + 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; +static uint16_t shiftTable3u_1[32] = { + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; +static uint16_t permutexIdxTable3u[32] = { + 0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, 0x0, 0x0, 0x0, 0x0, 0x0, + 6u, 7u, 8u, 0x0, 0x0, 0x0, 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; + +// ------------------------------------ 5u ----------------------------------------- +static uint8_t shuffleIdxTable5u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; +static uint8_t shuffleIdxTable5u_1[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u}; +static uint16_t shiftTable5u_0[32] = { + 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; +static uint16_t shiftTable5u_1[32] = { + 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 
0u}; +static uint16_t permutexIdxTable5u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + +// ------------------------------------ 6u ----------------------------------------- +static uint8_t shuffleIdxTable6u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; +static uint8_t shuffleIdxTable6u_1[64] = { + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; +static uint16_t shiftTable6u_0[32] = { + 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; +static uint16_t shiftTable6u_1[32] = { + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; +static uint32_t permutexIdxTable6u[16] = { + 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + +// ------------------------------------ 7u ----------------------------------------- +static uint8_t shuffleIdxTable7u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u}; +static uint8_t shuffleIdxTable7u_1[64] = { + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 
14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; +static uint16_t shiftTable7u_0[32] = { + 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; +static uint16_t shiftTable7u_1[32] = { + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; +static uint16_t permutexIdxTable7u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + +// ------------------------------------ 9u ----------------------------------------- +static uint16_t permutexIdxTable9u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; +static uint16_t permutexIdxTable9u_1[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; +static uint32_t shiftTable9u_0[16] = { + 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; +static uint32_t shiftTable9u_1[16] = { + 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; + +static uint8_t shuffleIdxTable9u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; +static uint16_t shiftTable9u_2[32] = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +static uint64_t gatherIdxTable9u[8] = { + 0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + +// ------------------------------------ 
10u ----------------------------------------- +static uint8_t shuffleIdxTable10u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; +static uint16_t shiftTable10u[32] = { + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; +static uint16_t permutexIdxTable10u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + +// ------------------------------------ 11u ----------------------------------------- +static uint16_t permutexIdxTable11u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u}; +static uint16_t permutexIdxTable11u_1[32] = { + 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; +static uint32_t shiftTable11u_0[16] = { + 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; +static uint32_t shiftTable11u_1[16] = { + 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; + +static uint8_t shuffleIdxTable11u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; +static uint8_t shuffleIdxTable11u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 
0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; +static uint32_t shiftTable11u_2[16] = { + 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; +static uint32_t shiftTable11u_3[16] = { + 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; +static uint64_t gatherIdxTable11u[8] = { + 0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + +// ------------------------------------ 12u ----------------------------------------- +static uint8_t shuffleIdxTable12u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; +static uint16_t shiftTable12u[32] = { + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; +static uint32_t permutexIdxTable12u[16] = { + 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + +// ------------------------------------ 13u ----------------------------------------- +static uint16_t permutexIdxTable13u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, + 13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u}; +static uint16_t permutexIdxTable13u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, + 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; +static uint32_t shiftTable13u_0[16] = { + 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; +static uint32_t shiftTable13u_1[16] = { + 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; + +static uint8_t shuffleIdxTable13u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 
2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; +static uint8_t shuffleIdxTable13u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; +static uint32_t shiftTable13u_2[16] = { + 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; +static uint32_t shiftTable13u_3[16] = { + 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; +static uint64_t gatherIdxTable13u[8] = { + 0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + +// ------------------------------------ 14u ----------------------------------------- +static uint8_t shuffleIdxTable14u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; +static uint8_t shuffleIdxTable14u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; +static uint32_t shiftTable14u_0[16] = { + 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; +static uint32_t shiftTable14u_1[16] = { + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; +static uint16_t permutexIdxTable14u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + +// 
------------------------------------ 15u ----------------------------------------- +static uint16_t permutexIdxTable15u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, + 15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u}; +static uint16_t permutexIdxTable15u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, + 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; +static uint32_t shiftTable15u_0[16] = { + 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; +static uint32_t shiftTable15u_1[16] = { + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; + +static uint8_t shuffleIdxTable15u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u}; +static uint8_t shuffleIdxTable15u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; +static uint32_t shiftTable15u_2[16] = { + 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; +static uint32_t shiftTable15u_3[16] = { + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; +static uint64_t gatherIdxTable15u[8] = { + 0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + +// ------------------------------------ 17u ----------------------------------------- +static uint32_t permutexIdxTable17u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; +static uint32_t permutexIdxTable17u_1[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 
6u, 6u, 7u, 7u, 8u}; +static uint64_t shiftTable17u_0[8] = { + 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; +static uint64_t shiftTable17u_1[8] = { + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +static uint8_t shuffleIdxTable17u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; +static uint32_t shiftTable17u_2[16] = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; +static uint64_t gatherIdxTable17u[8] = { + 0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; + +// ------------------------------------ 18u ----------------------------------------- +static uint32_t permutexIdxTable18u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; +static uint32_t permutexIdxTable18u_1[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; +static uint64_t shiftTable18u_0[8] = { + 0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; +static uint64_t shiftTable18u_1[8] = { + 14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; + +static uint8_t shuffleIdxTable18u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; +static uint32_t shiftTable18u_2[16] = { + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; +static uint64_t gatherIdxTable18u[8] = { + 0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + +// ------------------------------------ 19u ----------------------------------------- +static uint32_t permutexIdxTable19u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; +static uint32_t permutexIdxTable19u_1[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; +static 
uint64_t shiftTable19u_0[8] = { + 0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; +static uint64_t shiftTable19u_1[8] = { + 13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; + +static uint8_t shuffleIdxTable19u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; +static uint32_t shiftTable19u_2[16] = { + 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; +static uint64_t gatherIdxTable19u[8] = { + 0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; + +// ------------------------------------ 20u ----------------------------------------- +static uint8_t shuffleIdxTable20u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; +static uint32_t shiftTable20u[16] = { + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; +static uint16_t permutexIdxTable20u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + +// ------------------------------------ 21u ----------------------------------------- +static uint32_t permutexIdxTable21u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; +static uint32_t permutexIdxTable21u_1[16] = { + 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; +static uint64_t shiftTable21u_0[8] = { + 0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; +static uint64_t shiftTable21u_1[8] = { + 11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; + +static uint8_t shuffleIdxTable21u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 
8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; +static uint32_t shiftTable21u_2[16] = { + 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; +static uint64_t gatherIdxTable21u[8] = { + 0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; + +// ------------------------------------ 22u ----------------------------------------- +static uint32_t permutexIdxTable22u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; +static uint32_t permutexIdxTable22u_1[16] = { + 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; +static uint64_t shiftTable22u_0[8] = { + 0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; +static uint64_t shiftTable22u_1[8] = { + 10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; + +static uint8_t shuffleIdxTable22u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; +static uint32_t shiftTable22u_2[16] = { + 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; +static uint64_t gatherIdxTable22u[8] = { + 0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + +// ------------------------------------ 23u ----------------------------------------- +static uint32_t permutexIdxTable23u_0[16] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; +static uint32_t permutexIdxTable23u_1[16] = { + 0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; +static uint64_t shiftTable23u_0[8] = { + 0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; +static uint64_t shiftTable23u_1[8] = { + 9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; + +static uint8_t shuffleIdxTable23u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 
12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; +static uint32_t shiftTable23u_2[16] = { + 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; +static uint64_t gatherIdxTable23u[8] = { + 0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; + +// ------------------------------------ 24u ----------------------------------------- +static uint8_t shuffleIdxTable24u_0[64] = { + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; +static uint32_t permutexIdxTable24u[16] = { + 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + +// ------------------------------------ 26u ----------------------------------------- +static uint32_t permutexIdxTable26u_0[16] = { + 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; +static uint32_t permutexIdxTable26u_1[16] = { + 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; +static uint64_t shiftTable26u_0[8] = { + 0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; +static uint64_t shiftTable26u_1[8] = { + 6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; + +static uint8_t shuffleIdxTable26u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; +static uint32_t shiftTable26u_2[16] = { + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; +static uint64_t gatherIdxTable26u[8] = { + 0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + +// ------------------------------------ 28u ----------------------------------------- +static uint8_t 
shuffleIdxTable28u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; +static uint32_t shiftTable28u[16] = { + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; +static uint16_t permutexIdxTable28u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + +// ------------------------------------ 30u ----------------------------------------- +static uint32_t permutexIdxTable30u_0[16] = { + 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; +static uint32_t permutexIdxTable30u_1[16] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; +static uint64_t shiftTable30u_0[8] = { + 0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; +static uint64_t shiftTable30u_1[8] = { + 2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; + +static uint8_t shuffleIdxTable30u_0[64] = { + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u}; +static uint8_t shuffleIdxTable30u_1[64] = { + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u}; +static uint64_t shiftTable30u_2[8] = { + 34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; +static uint64_t shiftTable30u_3[8] = { + 28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; +static uint64_t gatherIdxTable30u[8] = { + 0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + 
+static uint64_t nibbleReverseTable[8] = {
+  0x0E060A020C040800,
+  0x0F070B030D050901,
+  0x0E060A020C040800,
+  0x0F070B030D050901,
+  0x0E060A020C040800,
+  0x0F070B030D050901,
+  0x0E060A020C040800,
+  0x0F070B030D050901
+};
+
+static uint64_t reverseMaskTable1u[8] = {
+  0x0001020304050607,
+  0x08090A0B0C0D0E0F,
+  0x1011121314151617,
+  0x18191A1B1C1D1E1F,
+  0x2021222324252627,
+  0x28292A2B2C2D2E2F,
+  0x3031323334353637,
+  0x38393A3B3C3D3E3F
+};
+
+static uint64_t reverseMaskTable16u[8] = {
+  0x0607040502030001,
+  0x0E0F0C0D0A0B0809,
+  0x1617141512131011,
+  0x1E1F1C1D1A1B1819,
+  0x2627242522232021,
+  0x2E2F2C2D2A2B2829,
+  0x3637343532333031,
+  0x3E3F3C3D3A3B3839
+};
+
+static uint64_t reverseMaskTable32u[8] = {
+  0x0405060700010203,
+  0x0C0D0E0F08090A0B,
+  0x1415161710111213,
+  0x1C1D1E1F18191A1B,
+  0x2425262720212223,
+  0x2C2D2E2F28292A2B,
+  0x3435363730313233,
+  0x3C3D3E3F38393A3B
+};
+
+/**
+ * Find the smallest i in [0, bitsize) for which
+ * (i * base) % bitsize == bitsize - start_bit.
+ *
+ * Declared inline because this definition lives in a header: without it,
+ * every translation unit that includes the header would emit its own
+ * external definition and the program would fail to link.
+ *
+ * @param start_bit value subtracted from bitsize to form the target remainder
+ * @param base      multiplier applied to each candidate i
+ * @param bitsize   modulus, and exclusive upper bound of the search
+ * @return the first matching i, or 0xFFFFFFFF when no candidate matches.
+ *         No candidate can match when start_bit is 0 (the target equals
+ *         bitsize, which a remainder never reaches), when start_bit exceeds
+ *         bitsize (the unsigned subtraction wraps), or when bitsize is 0
+ *         (the loop body never runs).
+ */
+inline uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) {
+  const uint32_t remnant = bitsize - start_bit;
+  for (uint32_t i = 0u; i < bitsize; ++i) {
+    if ((i * base) % bitsize == remnant) {
+      return i;
+    }
+  }
+  // Sentinel: no multiplier in range produced the requested remainder.
+  return 0xFFFFFFFF;
+}
+
+/**
+ * Divide x by y, rounding the quotient up to the next integer.
+ *
+ * Written as quotient-plus-adjustment rather than (x + y - 1) / y so that
+ * values of x close to UINT64_MAX cannot overflow.
+ *
+ * @param x dividend
+ * @param y divisor; must be non-zero (division by zero is undefined)
+ * @return x / y when y divides x exactly, otherwise x / y + 1
+ */
+inline uint64_t moveLen(uint64_t x, uint64_t y) {
+  return x / y + ((x % y == 0) ? 0u : 1u);
+}
+#endif
+}
+#endif
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index ed2715bf52..4cf55ddf80 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -16,13 +16,12 @@ include_directories( ${PROJECT_BINARY_DIR}/c++/src ) -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX11_FLAGS} ${WARN_FLAGS}") add_executable (orc-test MemoryInputStream.cc MemoryOutputStream.cc TestAttributes.cc - TestBlockBuffer.cc TestBufferedOutputStream.cc TestBloomFilter.cc TestByteRle.cc @@ -42,6 +41,7 @@ add_executable
(orc-test TestReader.cc TestRleDecoder.cc TestRleEncoder.cc + TestRleVectorDecoder.cc TestRLEV2Util.cc TestSargsApplier.cc TestSearchArgument.cc @@ -58,7 +58,6 @@ target_link_libraries (orc-test orc::protobuf orc::snappy orc::zlib - orc::gtest orc::gmock ) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc new file mode 100644 index 0000000000..05cc31cd31 --- /dev/null +++ b/c++/test/TestRleVectorDecoder.cc @@ -0,0 +1,639 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "MemoryOutputStream.hh" +#include "RLEv2.hh" + +#include "wrap/orc-proto-wrapper.hh" +#include "wrap/gtest-wrapper.h" + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +#endif + +namespace orc { + + using ::testing::TestWithParam; + using ::testing::Values; + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + + class RleVectorTest : public TestWithParam { + virtual void SetUp(); + + protected: + bool alignBitpacking; + std::unique_ptr getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned); + + void runExampleTest(int64_t* inputData, uint64_t inputLength, + unsigned char* expectedOutput, uint64_t outputLength); + + void runTest(RleVersion version, + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + bool isSigned, + uint8_t bitWidth, + uint64_t blockSize = 0, + uint64_t numNulls = 0); + }; + + void vectorDecodeAndVerify( + RleVersion version, + const MemoryOutputStream& memStream, + int64_t * data, + uint64_t numValues, + const char* notNull, + uint64_t blockSize, + bool isSinged) { + std::unique_ptr decoder = createRleDecoder( + std::unique_ptr(new SeekableArrayInputStream( + memStream.getData(), + memStream.getLength(), blockSize)), + isSinged, version, *getDefaultPool(), + getDefaultReaderMetrics()); + + int64_t* decodedData = new int64_t[numValues]; + decoder->next(decodedData, numValues, notNull); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); + } + } + + delete [] decodedData; + } + + void RleVectorTest::SetUp() { + alignBitpacking = GetParam(); + } + + void generateDataFolBits( + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + int64_t* data, + uint8_t bitWidth, + uint64_t numNulls = 0, + char* notNull = nullptr) { + int64_t max = pow(2, bitWidth); + if (numNulls != 0 && notNull != nullptr) { + memset(notNull, 1, numValues); + while (numNulls > 0) { + uint64_t 
pos = static_cast(std::rand()) % numValues; + if (notNull[pos]) { + notNull[pos] = static_cast(0); + --numNulls; + } + } + } + + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + if (!random) { + data[i] = start + delta * static_cast(i); + } else { + data[i] = std::rand()%max; + } + } + } + } + +#define BARSTR "##################################################" +#define BARWIDTH 50 + void testProgress(const char* testName, int64_t offset, int64_t total) { + int32_t val = offset * 100 / total; + int32_t lpad = offset * BARWIDTH / total; + int32_t rpad = BARWIDTH - lpad; + + printf("\r%s:%3d%% [%.*s%*s] [%ld/%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); + fflush(stdout); + } + + std::unique_ptr RleVectorTest::getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned) + { + MemoryPool * pool = getDefaultPool(); + + return createRleEncoder( + std::unique_ptr( + new BufferedOutputStream( + *pool, &memStream, 500 * 1024, 1024, nullptr)), + isSigned, version, *pool, alignBitpacking); + } + + void RleVectorTest::runTest(RleVersion version, + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + bool isSigned, + uint8_t bitWidth, + uint64_t blockSize, + uint64_t numNulls) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr encoder = getEncoder(version, memStream, isSigned); + + char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; + int64_t* data = new int64_t[numValues]; + generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->flush(); + + vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, isSigned); + delete [] data; + delete [] notNull; + } + +#if ENABLE_AVX512 + TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { + uint8_t bitWidth = 1; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("1bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("1bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { + uint8_t bitWidth = 2; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("2bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("2bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { + uint8_t bitWidth = 3; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("3bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; 
dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("3bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_4bit) { + uint8_t bitWidth = 4; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("4bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("4bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { + uint8_t bitWidth = 5; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("5bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("5bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { + uint8_t bitWidth = 6; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("6bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("6bit Test 2nd Part", blockSize, 10000); + } + 
printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { + uint8_t bitWidth = 7; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("7bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("7bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { + uint8_t bitWidth = 9; + + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("9bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("9bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_10bit) { + uint8_t bitWidth = 10; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("10bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("10bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { + uint8_t bitWidth = 11; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("11bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("11bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { + uint8_t bitWidth = 12; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("12bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("12bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { + uint8_t bitWidth = 13; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("13bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("13bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { + uint8_t bitWidth = 14; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("14bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t 
blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("14bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_15bit) { + uint8_t bitWidth = 15; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("15bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("15bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { + uint8_t bitWidth = 16; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("16bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("16bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { + uint8_t bitWidth = 17; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("17bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, 
true, false, bitWidth, blockSize); + } + testProgress("17bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { + uint8_t bitWidth = 18; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("18bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("18bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { + uint8_t bitWidth = 19; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("19bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("19bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_20bit) { + uint8_t bitWidth = 20; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("20bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("20bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_21bit) { + uint8_t bitWidth = 21; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("21bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("21bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { + uint8_t bitWidth = 22; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("22bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("22bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { + uint8_t bitWidth = 23; + runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("23bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("23bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_24bit) { + uint8_t bitWidth = 24; + for (uint64_t blockSize = 1; blockSize <= 
10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("24bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("24bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { + uint8_t bitWidth = 26; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("26bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("26bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { + uint8_t bitWidth = 28; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("28bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("28bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { + uint8_t bitWidth = 30; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("30bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + 
+ for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("30bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } + + TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { + uint8_t bitWidth = 32; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("32bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("32bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); + } +#endif + + INSTANTIATE_TEST_CASE_P(OrcTest, RleVectorTest, Values(true,false)); +} + From acbc2145bfd5d39a054d10d99e1f381085e4e6c5 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Tue, 10 Jan 2023 12:13:50 +0530 Subject: [PATCH 02/80] Fix some conflicts. 
--- CMakeLists.txt | 34 +- c++/src/RLEv2.hh | 398 +-- c++/src/RleDecoderV2.cc | 5368 +++++---------------------------------- 3 files changed, 851 insertions(+), 4949 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7f55f11fb..b936062b19 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,14 @@ endif () # # Compiler specific flags # +# This ensures that things like c++17 get passed correctly +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 17) + message(FATAL_ERROR "Cannot set a CMAKE_CXX_STANDARD smaller than 17") +endif() +# We require a C++17 compliant compiler +set(CMAKE_CXX_STANDARD_REQUIRED ON) if (NOT MSVC) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG -fno-omit-frame-pointer") @@ -101,7 +109,12 @@ if (NOT MSVC) endif () message(STATUS "compiler ${CMAKE_CXX_COMPILER_ID} version ${CMAKE_CXX_COMPILER_VERSION}") if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set (CXX11_FLAGS "-std=c++11") + if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR + CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") + message(FATAL_ERROR "A c++17-compliant compiler is required, please use at least Clang 5") + else () + set (CXX17_FLAGS "-std=c++17") + endif () set (WARN_FLAGS "-Weverything -Wno-c++98-compat -Wno-missing-prototypes") set (WARN_FLAGS "${WARN_FLAGS} -Wno-c++98-compat-pedantic -Wno-padded") set (WARN_FLAGS "${WARN_FLAGS} -Wno-covered-switch-default") @@ -120,22 +133,27 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set (WARN_FLAGS "${WARN_FLAGS} -Werror") endif () elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR + CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") + message(FATAL_ERROR "A c++17-compliant compiler is required, please use at least GCC 5") + else () + set (CXX17_FLAGS "-std=c++17") + endif () set (WARN_FLAGS "-Wall -Wno-unknown-pragmas -Wno-conversion") if 
(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "12.0") set (WARN_FLAGS "${WARN_FLAGS} -Wno-array-bounds -Wno-stringop-overread") # To compile protobuf in Fedora37 - elseif (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") - set (WARN_FLAGS "${WARN_FLAGS} -Wno-unused-function") endif () if (STOP_BUILD_ON_WARNING) set (WARN_FLAGS "${WARN_FLAGS} -Werror") endif () - if (CMAKE_CXX_COMPILER_VERSION STREQUAL "" OR - CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.7") - set (CXX11_FLAGS "-std=c++0x") +elseif (MSVC) + include(CheckCXXCompilerFlag) + CHECK_CXX_COMPILER_FLAG("/std:c++17" CPP17_FLAG_SUPPORTED) + if (CPP17_FLAG_SUPPORTED) + add_compile_options("/std:c++17") else () - set (CXX11_FLAGS "-std=c++11") + message(FATAL_ERROR "A c++17-compliant compiler is required") endif () -elseif (MSVC) add_definitions (-D_SCL_SECURE_NO_WARNINGS) add_definitions (-D_CRT_SECURE_NO_WARNINGS) add_definitions (-D_CRT_NONSTDC_NO_DEPRECATE) # The POSIX name for this item is deprecated diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index b766d2ae60..b2654b14fe 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -1,27 +1,27 @@ /** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef ORC_RLEV2_HH #define ORC_RLEV2_HH #include "Adaptor.hh" -#include "orc/Exceptions.hh" #include "RLE.hh" +#include "orc/Exceptions.hh" #include @@ -33,46 +33,76 @@ #define HIST_LEN 32 namespace orc { -struct FixedBitSizes { + struct FixedBitSizes { enum FBS { - ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, - TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE + ONE = 0, + TWO, + THREE, + FOUR, + FIVE, + SIX, + SEVEN, + EIGHT, + NINE, + TEN, + ELEVEN, + TWELVE, + THIRTEEN, + FOURTEEN, + FIFTEEN, + SIXTEEN, + SEVENTEEN, + EIGHTEEN, + NINETEEN, + TWENTY, + TWENTYONE, + TWENTYTWO, + TWENTYTHREE, + TWENTYFOUR, + TWENTYSIX, + TWENTYEIGHT, + THIRTY, + THIRTYTWO, + FORTY, + FORTYEIGHT, + FIFTYSIX, + SIXTYFOUR, + SIZE }; -}; - -enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; - -struct EncodingOption { - EncodingType encoding; - int64_t fixedDelta; - int64_t gapVsPatchListCount; - int64_t zigzagLiteralsCount; - int64_t baseRedLiteralsCount; - int64_t 
adjDeltasCount; - uint32_t zzBits90p; - uint32_t zzBits100p; - uint32_t brBits95p; - uint32_t brBits100p; - uint32_t bitsDeltaMax; - uint32_t patchWidth; - uint32_t patchGapWidth; - uint32_t patchLength; - int64_t min; - bool isFixedDelta; -}; - -class RleEncoderV2 : public RleEncoder { -public: - RleEncoderV2(std::unique_ptr outStream, bool hasSigned, bool alignBitPacking = true); + }; + + enum EncodingType { SHORT_REPEAT = 0, DIRECT = 1, PATCHED_BASE = 2, DELTA = 3 }; + + struct EncodingOption { + EncodingType encoding; + int64_t fixedDelta; + int64_t gapVsPatchListCount; + int64_t zigzagLiteralsCount; + int64_t baseRedLiteralsCount; + int64_t adjDeltasCount; + uint32_t zzBits90p; + uint32_t zzBits100p; + uint32_t brBits95p; + uint32_t brBits100p; + uint32_t bitsDeltaMax; + uint32_t patchWidth; + uint32_t patchGapWidth; + uint32_t patchLength; + int64_t min; + bool isFixedDelta; + }; + + class RleEncoderV2 : public RleEncoder { + public: + RleEncoderV2(std::unique_ptr outStream, bool hasSigned, + bool alignBitPacking = true); ~RleEncoderV2() override { - delete [] literals; - delete [] gapVsPatchList; - delete [] zigzagLiterals; - delete [] baseRedLiterals; - delete [] adjDeltas; + delete[] literals; + delete[] gapVsPatchList; + delete[] zigzagLiterals; + delete[] baseRedLiterals; + delete[] adjDeltas; } /** * Flushing underlying BufferedOutputStream @@ -81,20 +111,19 @@ public: void write(int64_t val) override; -private: - + private: const bool alignedBitPacking; uint32_t fixedRunLength; uint32_t variableRunLength; int64_t prevDelta; int32_t histgram[HIST_LEN]; - // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), - // it is move here for performance consideration. + // The four list below should actually belong to EncodingOption since it only holds temporal + // values in write(int64_t val), it is move here for performance consideration. 
int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; + int64_t* zigzagLiterals; + int64_t* baseRedLiterals; + int64_t* adjDeltas; uint32_t getOpCode(EncodingType encoding); int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); @@ -109,136 +138,139 @@ private: void writeDirectValues(EncodingOption& option); void writePatchedBasedValues(EncodingOption& option); void writeDeltaValues(EncodingOption& option); - uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); -}; - -class RleDecoderV2 : public RleDecoder { -public: - RleDecoderV2(std::unique_ptr input, - bool isSigned, MemoryPool& pool, - ReaderMetrics* metrics); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - - /** - * Decode the next gap and patch from 'unpackedPatch' and update the index on it. - * Used by PATCHED_BASE. 
- * - * @param patchBitSize bit size of the patch value - * @param patchMask mask for the patch value - * @param resGap result of gap - * @param resPatch result of patch - * @param patchIdx current index in the 'unpackedPatch' buffer - */ - void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, - int64_t* resGap, int64_t* resPatch, uint64_t* patchIdx); - - void resetReadLongs() { - bitsLeft = 0; - curByte = 0; - } - - void resetRun() { - resetReadLongs(); - } - - void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); - - unsigned char readByte(); - - int64_t readLongBE(uint64_t bsz); - int64_t readVslong(); - uint64_t readVulong(); - void readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs, + uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, + bool reuseHist = false); + }; + + class RleDecoderV2 : public RleDecoder { + public: + RleDecoderV2(std::unique_ptr input, bool isSigned, MemoryPool& pool, + ReaderMetrics* metrics); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. + */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + template + void next(T* data, uint64_t numValues, const char* notNull); + + void next(int64_t* data, uint64_t numValues, const char* notNull) override; + + void next(int32_t* data, uint64_t numValues, const char* notNull) override; + + void next(int16_t* data, uint64_t numValues, const char* notNull) override; + + private: + /** + * Decode the next gap and patch from 'unpackedPatch' and update the index on it. + * Used by PATCHED_BASE. 
+ * + * @param patchBitSize bit size of the patch value + * @param patchMask mask for the patch value + * @param resGap result of gap + * @param resPatch result of patch + * @param patchIdx current index in the 'unpackedPatch' buffer + */ + void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, + int64_t* resPatch, uint64_t* patchIdx); + + void resetReadLongs() { + bitsLeft = 0; + curByte = 0; + } + + void resetRun() { + resetReadLongs(); + } + + void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); + unsigned char readByte(); + + int64_t readLongBE(uint64_t bsz); + int64_t readVslong(); + uint64_t readVulong(); + void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); #if ENABLE_AVX512 - void unrolledUnpackVector1(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector2(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector3(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector4(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector5(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector16(int64_t *data, uint64_t offset, 
uint64_t len); - void unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector1(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector2(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector3(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector4(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector5(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector15(int64_t *data, uint64_t offset, 
uint64_t len); + void unrolledUnpackVector16(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len); #endif - void unrolledUnpack4(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack8(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack16(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack24(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack32(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack40(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len); - - uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDelta(int64_t* data, 
uint64_t offset, uint64_t numValues, - const char* notNull); - - uint64_t copyDataFromBuffer(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - - const std::unique_ptr inputStream; - const bool isSigned; - - unsigned char firstByte; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run - const char *bufferStart; - const char *bufferEnd; - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs - DataBuffer unpackedPatch; // Used by PATCHED_BASE - DataBuffer literals; // Values of the current run + void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); + + template + uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template + uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template + uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template + uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + template + uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull); + + const std::unique_ptr inputStream; + const bool isSigned; + + unsigned char firstByte; + uint64_t runLength; // Length of the current run + uint64_t runRead; // Number of returned 
values of the current run + const char* bufferStart; + const char* bufferEnd; + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs + DataBuffer unpackedPatch; // Used by PATCHED_BASE + DataBuffer literals; // Values of the current run #if ENABLE_AVX512 - uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data - uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; // Used by vectorially 9~16 bit-unpacking data - uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; // Used by vectorially 17~32 bit-unpacking data + uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data + uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; // Used by vectorially 9~16 bit-unpacking data + uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; // Used by vectorially 17~32 bit-unpacking data #endif -}; + }; } // namespace orc #endif // ORC_RLEV2_HH diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 6cfc7bf2e0..2742aef6f6 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -18,185 +18,55 @@ #include "Adaptor.hh" #include "Compression.hh" -#include "RLEv2.hh" #include "RLEV2Util.hh" -#include "VectorDecoder.hh" -#include "DetectPlatform.hh" +#include "RLEv2.hh" #include "Utils.hh" namespace orc { -void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { - uint64_t restLen = bufferEnd - bufferStart; - int bufferLength = 0; - const void* bufferPointer = nullptr; - - if (backupByteLen != 0) { - inputStream->BackUp(backupByteLen); - } - if (len >= restLen && resetBuf == true) { - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::resetBufferStart"); + unsigned char RleDecoderV2::readByte() { + SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); + if (bufferStart == bufferEnd) { + int bufferLength; + const void* 
bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::readByte"); + } + bufferStart = static_cast(bufferPointer); + bufferEnd = bufferStart + bufferLength; } - } - if (bufferPointer == nullptr) { - bufferStart += len; - } else { - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + unsigned char result = static_cast(*bufferStart++); + return result; } -} -unsigned char RleDecoderV2::readByte() { - SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::readByte"); + int64_t RleDecoderV2::readLongBE(uint64_t bsz) { + int64_t ret = 0, val; + uint64_t n = bsz; + while (n > 0) { + n--; + val = readByte(); + ret |= (val << (n * 8)); } - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + return ret; } - unsigned char result = static_cast(*bufferStart++); - return result; -} - -int64_t RleDecoderV2::readLongBE(uint64_t bsz) { - int64_t ret = 0, val; - uint64_t n = bsz; - while (n > 0) { - n--; - val = readByte(); - ret |= (val << (n * 8)); + inline int64_t RleDecoderV2::readVslong() { + return unZigZag(readVulong()); } - return ret; -} - -inline int64_t RleDecoderV2::readVslong() { - return unZigZag(readVulong()); -} -uint64_t RleDecoderV2::readVulong() { - uint64_t ret = 0, b; - uint64_t offset = 0; - do { - b = readByte(); - ret |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return ret; -} + uint64_t RleDecoderV2::readVulong() { + uint64_t ret = 0, b; + uint64_t offset = 0; + do { + b = readByte(); + ret |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return ret; + } -void RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs) { - uint64_t startBit = 0; -#if ENABLE_AVX512 - if 
(detect_platform() == arch_t::avx512_arch) { - switch (fbs) { - case 1: - unrolledUnpackVector1(data, offset, len); - return; - case 2: - unrolledUnpackVector2(data, offset, len); - return; - case 3: - unrolledUnpackVector3(data, offset, len); - return; - case 4: - unrolledUnpackVector4(data, offset, len); - return; - case 5: - unrolledUnpackVector5(data, offset, len); - return; - case 6: - unrolledUnpackVector6(data, offset, len); - return; - case 7: - unrolledUnpackVector7(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 9: - unrolledUnpackVector9(data, offset, len); - return; - case 10: - unrolledUnpackVector10(data, offset, len); - return; - case 11: - unrolledUnpackVector11(data, offset, len); - return; - case 12: - unrolledUnpackVector12(data, offset, len); - return; - case 13: - unrolledUnpackVector13(data, offset, len); - return; - case 14: - unrolledUnpackVector14(data, offset, len); - return; - case 15: - unrolledUnpackVector15(data, offset, len); - return; - case 16: - unrolledUnpackVector16(data, offset, len); - return; - case 17: - unrolledUnpackVector17(data, offset, len); - return; - case 18: - unrolledUnpackVector18(data, offset, len); - return; - case 19: - unrolledUnpackVector19(data, offset, len); - return; - case 20: - unrolledUnpackVector20(data, offset, len); - return; - case 21: - unrolledUnpackVector21(data, offset, len); - return; - case 22: - unrolledUnpackVector22(data, offset, len); - return; - case 23: - unrolledUnpackVector23(data, offset, len); - return; - case 24: - unrolledUnpackVector24(data, offset, len); - return; - case 26: - unrolledUnpackVector26(data, offset, len); - return; - case 28: - unrolledUnpackVector28(data, offset, len); - return; - case 30: - unrolledUnpackVector30(data, offset, len); - return; - case 32: - unrolledUnpackVector32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - 
return; - case 56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs, startBit); - return; - } - } else { + void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { switch (fbs) { case 4: unrolledUnpack4(data, offset, len); @@ -227,4686 +97,668 @@ void RleDecoderV2::readLongs(int64_t *data, uint64_t offset, uint64_t len, uint6 return; default: // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs, startBit); + plainUnpackLongs(data, offset, len, fbs); return; } } -#else - switch (fbs) { - case 4: - unrolledUnpack4(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 16: - unrolledUnpack16(data, offset, len); - return; - case 24: - unrolledUnpack24(data, offset, len); - return; - case 32: - unrolledUnpack32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. 
- plainUnpackLongs(data, offset, len, fbs, startBit); - return; - } -#endif -} - -#if ENABLE_AVX512 -void RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 1; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; + void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
+ while (bitsLeft > 0 && curIdx < offset + len) { + bitsLeft -= 4; + data[curIdx++] = (curByte >> bitsLeft) & 15; } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); - while (numElements >= 64) { - uint64_t src_64 = *(uint64_t *)srcPtr; - // convert mask to 512-bit register. 0 --> 0x00, 1 --> 0xFF - __m512i srcmm = _mm512_movm_epi8(src_64); - // make 0x00 --> 0x00, 0xFF --> 0x01 - srcmm = _mm512_abs_epi8(srcmm); - srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); - _mm512_storeu_si512(vectorBuf8, srcmm); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } + if (curIdx == offset + len) return; - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + // Exhaust the buffer + uint64_t numGroups = (offset + len - curIdx) / 2; + numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); + // Avoid updating 'bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(bufferStart); + uint32_t localByte; + for (uint64_t i = 0; i < numGroups; ++i) { + localByte = *buffer++; + data[curIdx] = (localByte >> 4) & 15; + data[curIdx + 1] = localByte & 15; + curIdx += 2; } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + // readByte() will update 'bufferStart' and 'bufferEnd' + curByte = readByte(); + bitsLeft = 8; } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); } -} - -void RleDecoderV2::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 2; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len 
-= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; + void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = bufferEnd - bufferStart; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + data[curIdx++] = *buffer++; } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } + // readByte() will update 'bufferStart' and 'bufferEnd'. 
+ data[curIdx++] = readByte(); } + } - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) - __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 - while (numElements >= 64) { - __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); - __m512i srcmm0, srcmm1, srcmm2, tmpmm; - - srcmm2 = _mm512_srli_epi16(srcmm3, 2); - srcmm1 = _mm512_srli_epi16(srcmm3, 4); - srcmm0 = _mm512_srli_epi16(srcmm3, 6); - - // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. - // move them into their places - // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 - // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 - srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn - - // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 - // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 - srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 - srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op - - tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl - srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop - srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop - - srcmm0 = _mm512_and_si512(srcmm0, parse_mask); - - _mm512_storeu_si512(vectorBuf8, srcmm0); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + 
len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 2; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint16_t b0, b1; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + buffer += 2; + data[curIdx++] = (b0 << 8) | b1; } - } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + data[curIdx++] = (b0 << 8) | b1; } + } - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; + void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 3; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2; + // Avoid updating 'bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + buffer += 3; + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + bufferStart += bufferNum * 3; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); } + } - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 4; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2, b3; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + buffer += 4; + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); } -} - -void RleDecoderV2::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 3; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 5; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + buffer += 5; + data[curIdx++] = + static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } + } - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; + void RleDecoderV2::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 6; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + buffer += 6; + data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | + (b4 << 8) | b5); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + data[curIdx++] = + static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } + } + + void RleDecoderV2::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 7; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6; + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + buffer += 7; + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + b6 = readByte(); + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + } + + void RleDecoderV2::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (bufferEnd - bufferStart) / 8; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6, b7; + // Avoid updating 'bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + b7 = static_cast(*(buffer + 7)); + buffer += 8; + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + bufferStart = reinterpret_cast(buffer); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = readByte(); + b1 = readByte(); + b2 = readByte(); + b3 = readByte(); + b4 = readByte(); + b5 = readByte(); + b6 = readByte(); + b7 = readByte(); + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + } + + void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = readByte(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast(bitsLeftToRead); + result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + } + } + + RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, + MemoryPool& pool, ReaderMetrics* _metrics) + : RleDecoder(_metrics), + inputStream(std::move(input)), + isSigned(_isSigned), + firstByte(0), + runLength(0), + runRead(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + bitsLeft(0), + curByte(0), + unpackedPatch(pool, 0), + literals(pool, MAX_LITERAL_SIZE) { + // PASS + } 
+ + void RleDecoderV2::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // clear state + bufferEnd = bufferStart = nullptr; + runRead = runLength = 0; + // skip ahead the given number of records + skip(location.next()); + } + + void RleDecoderV2::skip(uint64_t numValues) { + // simple for now, until perf tests indicate something encoding specific is + // needed + const uint64_t N = 64; + int64_t dummy[N]; + + while (numValues) { + uint64_t nRead = std::min(N, numValues); + next(dummy, nRead, nullptr); + numValues -= nRead; + } + } + + template + void RleDecoderV2::next(T* const data, const uint64_t numValues, const char* const notNull) { + SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); + uint64_t nRead = 0; + + while (nRead < numValues) { + // Skip any nulls before attempting to read first byte. + while (notNull && !notNull[nRead]) { + if (++nRead == numValues) { + return; // ended with null values + } } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; + if (runRead == runLength) { + resetRun(); + firstByte = readByte(); } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); + uint64_t offset = nRead, length = numValues - nRead; - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); - 
shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + EncodingType enc = static_cast((firstByte >> 6) & 0x03); + switch (static_cast(enc)) { + case SHORT_REPEAT: + nRead += nextShortRepeats(data, offset, length, notNull); + break; + case DIRECT: + nRead += nextDirect(data, offset, length, notNull); + break; + case PATCHED_BASE: + nRead += nextPatched(data, offset, length, notNull); + break; + case DELTA: + nRead += nextDelta(data, offset, length, notNull); + break; + default: + throw ParseError("unknown encoding"); } } + } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = 
reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } + void RleDecoderV2::next(int64_t* data, uint64_t numValues, const char* notNull) { + next(data, numValues, notNull); + } - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + void RleDecoderV2::next(int32_t* data, uint64_t numValues, const char* notNull) { + next(data, numValues, notNull); } -} -void RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len){ - uint32_t bitWidth = 4; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; + void RleDecoderV2::next(int16_t* data, uint64_t numValues, const char* notNull) { + next(data, numValues, notNull); + } - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } + template + uint64_t RleDecoderV2::nextShortRepeats(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bytes + uint64_t byteSize = (firstByte >> 3) & 0x07; + byteSize += 1; - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { 
- if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } + runLength = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + runLength += MIN_REPEAT; + runRead = 0; - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + // read the repeated value which is store using fixed bytes + literals[0] = readLongBE(byteSize); - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; + if (isSigned) { + literals[0] = unZigZag(static_cast(literals[0])); } } - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) - __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 - while (numElements >= 64) { - __m512i srcmm0, srcmm1, tmpmm; - - srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm0 = _mm512_srli_epi16(srcmm1, 4); - - // move elements into their places - // srcmm0: a c e g 0 0 0 0 - // srcmm1: b d f h 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 - srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh - srcmm0 = 
_mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh - - // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. - srcmm0 = _mm512_and_si512(srcmm0, parseMask); - - _mm512_storeu_si512(vectorBuf8, srcmm0); + uint64_t nRead = std::min(runLength - runRead, numValues); - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + if (notNull) { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = static_cast(literals[0]); + ++runRead; + } } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len){ - uint32_t bitWidth = 5; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - 
uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = static_cast(literals[0]); + ++runRead; } } - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + return nRead; + } - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; + template + uint64_t RleDecoderV2::nextDirect(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + 
readLongs(literals.data(), 0, runLength, bitSize); + if (isSigned) { + for (uint64_t i = 0; i < runLength; ++i) { + literals[i] = unZigZag(static_cast(literals[i])); + } } } - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); + return copyDataFromBuffer(data, offset, numValues, notNull); + } - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); + void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, + int64_t* resPatch, uint64_t* patchIdx) { + uint64_t idx = *patchIdx; + uint64_t gap = static_cast(unpackedPatch[idx]) >> patchBitSize; + int64_t patch = unpackedPatch[idx] & patchMask; + int64_t actualGap = 0; - while (numElements >= 64) { - __m512i srcmm, zmm[2]; + // special case: gap is >255 then patch value will be 0. 
+ // if gap is <=255 then patch value cannot be 0 + while (gap == 255 && patch == 0) { + actualGap += 255; + ++idx; + gap = static_cast(unpackedPatch[idx]) >> patchBitSize; + patch = unpackedPatch[idx] & patchMask; + } + // add the left over gap + actualGap += gap; - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + *resGap = actualGap; + *resPatch = patch; + *patchIdx = idx; + } - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + template + uint64_t RleDecoderV2::nextPatched(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + // extract the number of bytes occupied by base + uint64_t thirdByte = readByte(); + uint64_t byteSize = (thirdByte >> 5) & 0x07; + // base width is one off + byteSize += 1; + + // extract patch width + uint32_t pwo = thirdByte & 0x1f; + uint32_t patchBitSize = decodeBitWidth(pwo); + + // read fourth byte and extract patch gap width + uint64_t fourthByte = readByte(); + uint32_t pgw = (fourthByte >> 5) & 0x07; + // patch gap width is one off + pgw += 1; + + // extract the length of the patch list + size_t pl = fourthByte & 0x1f; + if (pl == 0) { + throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); + } + + // read the next base width number of bytes to extract base value + int64_t base = readLongBE(byteSize); + int64_t mask = (static_cast(1) << ((byteSize * 8) - 1)); + // if mask of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + 
base = base & ~mask; + base = -base; + } + + readLongs(literals.data(), 0, runLength, bitSize); + // any remaining bits are thrown out + resetReadLongs(); + + // TODO: something more efficient than resize + unpackedPatch.resize(pl); + // TODO: Skip corrupt? + // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { + if ((patchBitSize + pgw) > 64) { + throw ParseError( + "Corrupt PATCHED_BASE encoded data " + "(patchBitSize + pgw > 64)!"); + } + uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); + readLongs(unpackedPatch.data(), 0, pl, cfb); + // any remaining bits are thrown out + resetReadLongs(); + + // apply the patch directly when decoding the packed data + int64_t patchMask = ((static_cast(1) << patchBitSize) - 1); + + int64_t gap = 0; + int64_t patch = 0; + uint64_t patchIdx = 0; + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + for (uint64_t i = 0; i < runLength; ++i) { + if (static_cast(i) != gap) { + // no patching required. 
add base to unpacked value to get final value + literals[i] += base; + } else { + // extract the patch value + int64_t patchedVal = literals[i] | (patch << bitSize); - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); + // add base to patched value + literals[i] = base + patchedVal; - _mm512_storeu_si512(vectorBuf8, zmm[0]); + // increment the patch to point to next entry in patch list + ++patchIdx; - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } + if (patchIdx < unpackedPatch.size()) { + adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + // next gap is relative to the current gap + gap += i; + } + } } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); } - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + return copyDataFromBuffer(data, offset, numValues, notNull); } -} - -void RleDecoderV2::unrolledUnpackVector6(int64_t *data, uint64_t offset, 
uint64_t len) { - uint32_t bitWidth = 6; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; + template + uint64_t RleDecoderV2::nextDelta(T* const data, uint64_t offset, uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + uint32_t bitSize; + if (fbo != 0) { + bitSize = decodeBitWidth(fbo); } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; + bitSize = 0; } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - 
bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - _mm512_storeu_si512(vectorBuf8, zmm[0]); + // extract the run length + runLength = static_cast(firstByte & 0x01) << 8; + runLength |= readByte(); + ++runLength; // account for first value + runRead = 0; - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + int64_t prevValue; + // read the first value stored as vint + if 
(isSigned) { + prevValue = readVslong(); } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + prevValue = static_cast(readVulong()); } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} -void RleDecoderV2::unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 7; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; + literals[0] = prevValue; - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH , ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + int64_t deltaBase = readVslong(); - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH 
+ ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; + if (bitSize == 0) { + // add fixed deltas to adjacent values + for (uint64_t i = 1; i < runLength; ++i) { + literals[i] = literals[i - 1] + deltaBase; + } } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; + prevValue = literals[1] = prevValue + deltaBase; + if (runLength < 2) { + std::stringstream ss; + ss << "Illegal run length for delta encoding: " << runLength; + throw ParseError(ss.str()); + } + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence. + // read deltas using the literals buffer. + readLongs(literals.data(), 2, runLength - 2, bitSize); + if (deltaBase < 0) { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue - literals[i]; + } + } else { + for (uint64_t i = 2; i < runLength; ++i) { + prevValue = literals[i] = prevValue + literals[i]; + } + } } } - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + return copyDataFromBuffer(data, offset, numValues, notNull); + } - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = 
_mm512_load_si512(permutexIdxTable7u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - 
plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 9; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); 
- bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - 
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 7); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - 
return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 10; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, 
ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); - __m512i shiftMask = _mm512_load_si512(shiftTable10u); - - while (numElements >= 32) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi16(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 11; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if 
(numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm 
= _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 5); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - 
bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 12; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = 
true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); - __m512i shiftMask = _mm512_load_si512(shiftTable12u); - - while (numElements >= 32) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi16(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - 
bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 13; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > 
numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = 
_mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 3); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - 
resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 14; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], 
shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 15; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + 
startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = 
_mm512_load_si512(shiftTable15u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they 
start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 1); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = 
reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector16(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 16; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = len; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - int64_t* dstPtr = data + offset; - bool resetBuf = false; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 32) { - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - while (numElements >= 32) { - __m512i srcmm = _mm512_loadu_si512(srcPtr); - srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); - _mm512_storeu_si512(vectorBuf16, srcmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack16(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; - unrolledUnpack16(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 17; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if 
(numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in 
zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 15); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, 
bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 18; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd 
- bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = 
_mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 14); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if 
(backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 19; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - 
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - 
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 13); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - 
resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 20; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0u) { - uint32_t align = getAlign(startBit, bitWidth, 32u); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * 
bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16u) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); - __m512i shiftMask = _mm512_load_si512(shiftTable20u); - - while (numElements >= 16u) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi32(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - 
dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 21; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0u) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - 
dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = 
_mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 11); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 22; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, 
bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable22u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - 
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 10); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - 
resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 23; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= moveLen(align * 
bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - 
- lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 9); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd 
- bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 24; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - - __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); - - while (numElements >= 16) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - 
bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack24(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; - unrolledUnpack24(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 26; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * 
bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 6); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - 
std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 28; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); - __m512i shiftMask = _mm512_load_si512(shiftTable28u); - - while (numElements >= 16) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi32(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, 
ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 30; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - 
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 2u); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4u); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = 
_mm512_slli_epi16(lowNibblemm, 4u); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} - -void RleDecoderV2::unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 32; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 16) { - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - while (numElements >= 16) { - __m512i srcmm = _mm512_loadu_si512(srcPtr); - srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); - _mm512_storeu_si512(vectorBuf32, srcmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack32(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; - unrolledUnpack32(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } -} -#endif - -void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
- while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; - } - if (curIdx == offset + len) return; - - // Exhaust the buffer - uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); - // Avoid updating 'bufferStart' inside the loop. - const auto *buffer = reinterpret_cast(bufferStart); - uint32_t localByte; - for (uint64_t i = 0; i < numGroups; ++i) { - localByte = *buffer++; - data[curIdx] = (localByte >> 4) & 15; - data[curIdx + 1] = localByte & 15; - curIdx += 2; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = readByte(); - bitsLeft = 8; - } -} - -void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = bufferEnd - bufferStart; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - data[curIdx++] = *buffer++; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = readByte(); - } -} - -void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 2; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint16_t b0, b1; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - buffer += 2; - data[curIdx++] = (b0 << 8) | b1; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - data[curIdx++] = (b0 << 8) | b1; - } -} - -void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 3; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - buffer += 3; - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - bufferStart += bufferNum * 3; - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } -} - -void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 4; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2, b3; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - buffer += 4; - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } -} - -void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 5; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - buffer += 5; - data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } -} - -void RleDecoderV2::unrolledUnpack48(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 6; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - buffer += 6; - data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); - } -} - -void RleDecoderV2::unrolledUnpack56(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 7; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - buffer += 7; - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); - } -} - -void RleDecoderV2::unrolledUnpack64(int64_t *data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 8; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - b7 = static_cast(*(buffer + 7)); - buffer += 8; - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - b7 = readByte(); - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } -} - -void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, - uint64_t fbs, uint64_t& startBit) { - for (uint64_t i = offset; i < (offset + len); i++) { - uint64_t result = 0; - uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast(result); - startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); - } -} - -RleDecoderV2::RleDecoderV2(std::unique_ptr input, - bool _isSigned, MemoryPool& pool, - ReaderMetrics* _metrics - ): RleDecoder(_metrics), - inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - runLength(0), - runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - bitsLeft(0), - curByte(0), - unpackedPatch(pool, 0), - literals(pool, MAX_LITERAL_SIZE) { - // PASS -} - -void RleDecoderV2::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV2::skip(uint64_t numValues) { - // simple for now, until perf tests indicate something encoding specific is - // needed - const uint64_t N = 64; - int64_t dummy[N]; - - while (numValues) { - uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); - numValues -= nRead; - } -} - -void 
RleDecoderV2::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - SCOPED_STOPWATCH(metrics, DecodingLatencyUs, DecodingCall); - uint64_t nRead = 0; - - while (nRead < numValues) { - // Skip any nulls before attempting to read first byte. - while (notNull && !notNull[nRead]) { - if (++nRead == numValues) { - return; // ended with null values - } - } - - if (runRead == runLength) { - resetRun(); - firstByte = readByte(); - } - - uint64_t offset = nRead, length = numValues - nRead; - - EncodingType enc = static_cast - ((firstByte >> 6) & 0x03); - switch(static_cast(enc)) { - case SHORT_REPEAT: - nRead += nextShortRepeats(data, offset, length, notNull); - break; - case DIRECT: - nRead += nextDirect(data, offset, length, notNull); - break; - case PATCHED_BASE: - nRead += nextPatched(data, offset, length, notNull); - break; - case DELTA: - nRead += nextDelta(data, offset, length, notNull); - break; - default: - throw ParseError("unknown encoding"); - } - } -} - -uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bytes - uint64_t byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; - - runLength = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; - - // read the repeated value which is store using fixed bytes - literals[0] = readLongBE(byteSize); - - if (isSigned) { - literals[0] = unZigZag(static_cast(literals[0])); - } - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - if (notNull) { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = literals[0]; - ++runRead; - } - } - } else { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = literals[0]; - ++runRead; - } - } - - return nRead; -} - -uint64_t RleDecoderV2::nextDirect(int64_t* 
const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - readLongs(literals.data(), 0, runLength, bitSize); - if (isSigned) { - for (uint64_t i = 0; i < runLength; ++i) { - literals[i] = unZigZag(static_cast(literals[i])); - } - } - } - - return copyDataFromBuffer(data, offset, numValues, notNull); -} - -void RleDecoderV2::adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, - int64_t* resGap, int64_t* resPatch, - uint64_t* patchIdx) { - uint64_t idx = *patchIdx; - uint64_t gap = static_cast(unpackedPatch[idx]) >> patchBitSize; - int64_t patch = unpackedPatch[idx] & patchMask; - int64_t actualGap = 0; - - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (gap == 255 && patch == 0) { - actualGap += 255; - ++idx; - gap = static_cast(unpackedPatch[idx]) >> patchBitSize; - patch = unpackedPatch[idx] & patchMask; - } - // add the left over gap - actualGap += gap; - - *resGap = actualGap; - *resPatch = patch; - *patchIdx = idx; -} - -uint64_t RleDecoderV2::nextPatched(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); - uint64_t byteSize = (thirdByte >> 5) & 0x07; - // base width is one off - byteSize += 1; - - // extract patch width - uint32_t pwo = thirdByte & 0x1f; - uint32_t patchBitSize = decodeBitWidth(pwo); - - // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); - uint32_t pgw = (fourthByte >> 5) & 0x07; - // patch gap width is one off - pgw += 1; - - // extract the length of the patch list - size_t pl = fourthByte & 0x1f; - if (pl == 0) { - throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); - } - - // read the next base width number of bytes to extract base value - int64_t base = readLongBE(byteSize); - int64_t mask = (static_cast(1) << ((byteSize * 8) - 1)); - // if mask of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - readLongs(literals.data(), 0, runLength, bitSize); - // any remaining bits are thrown out - resetReadLongs(); - - // TODO: something more efficient than resize - unpackedPatch.resize(pl); - // TODO: Skip corrupt? 
- // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - if ((patchBitSize + pgw) > 64) { - throw ParseError("Corrupt PATCHED_BASE encoded data " - "(patchBitSize + pgw > 64)!"); - } - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); - // any remaining bits are thrown out - resetReadLongs(); - - // apply the patch directly when decoding the packed data - int64_t patchMask = ((static_cast(1) << patchBitSize) - 1); - - int64_t gap = 0; - int64_t patch = 0; - uint64_t patchIdx = 0; - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, &patchIdx); - - for (uint64_t i = 0; i < runLength; ++i) { - if (static_cast(i) != gap) { - // no patching required. add base to unpacked value to get final value - literals[i] += base; - } else { - // extract the patch value - int64_t patchedVal = literals[i] | (patch << bitSize); - - // add base to patched value - literals[i] = base + patchedVal; - - // increment the patch to point to next entry in patch list - ++patchIdx; - - if (patchIdx < unpackedPatch.size()) { - adjustGapAndPatch(patchBitSize, patchMask, &gap, &patch, - &patchIdx); - - // next gap is relative to the current gap - gap += i; - } - } - } - } - - return copyDataFromBuffer(data, offset, numValues, notNull); -} - -uint64_t RleDecoderV2::nextDelta(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - uint32_t bitSize; - if (fbo != 0) { - bitSize = decodeBitWidth(fbo); - } else { - bitSize = 0; - } - - // extract the run length - runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = 0; - - int64_t prevValue; - // read the first value stored as vint - if (isSigned) { - prevValue = readVslong(); - } else { - prevValue = static_cast(readVulong()); - } - - literals[0] = prevValue; - - // read 
the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - int64_t deltaBase = readVslong(); - - if (bitSize == 0) { - // add fixed deltas to adjacent values - for (uint64_t i = 1; i < runLength; ++i) { - literals[i] = literals[i - 1] + deltaBase; + template + uint64_t RleDecoderV2::copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, + const char* notNull) { + uint64_t nRead = std::min(runLength - runRead, numValues); + if (notNull) { + for (uint64_t i = offset; i < (offset + nRead); ++i) { + if (notNull[i]) { + data[i] = static_cast(literals[runRead++]); + } } } else { - prevValue = literals[1] = prevValue + deltaBase; - if (runLength < 2) { - std::stringstream ss; - ss << "Illegal run length for delta encoding: " << runLength; - throw ParseError(ss.str()); - } - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence. - // read deltas using the literals buffer. 
- readLongs(literals.data(), 2, runLength - 2, bitSize); - if (deltaBase < 0) { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue - literals[i]; - } - } else { - for (uint64_t i = 2; i < runLength; ++i) { - prevValue = literals[i] = prevValue + literals[i]; - } - } - } - } - - return copyDataFromBuffer(data, offset, numValues, notNull); -} - -uint64_t RleDecoderV2::copyDataFromBuffer(int64_t* data, uint64_t offset, - uint64_t numValues, const char* notNull) { - uint64_t nRead = std::min(runLength - runRead, numValues); - if (notNull) { - for (uint64_t i = offset; i < (offset + nRead); ++i) { - if (notNull[i]) { - data[i] = literals[runRead++]; + for (uint64_t i = offset; i < (offset + nRead); ++i) { + data[i] = static_cast(literals[runRead++]); } } - } else { - memcpy(data + offset, literals.data() + runRead, nRead * sizeof(int64_t)); - runRead += nRead; + return nRead; } - return nRead; -} } // namespace orc From 293d863af508feffd4742099bd33d205ed40c76a Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Tue, 10 Jan 2023 14:58:59 +0800 Subject: [PATCH 03/80] Fix some conflicts. 
--- c++/src/RleDecoderV2.cc | 4165 ++++++++++++++++++++++++++++++++++++++- c++/test/CMakeLists.txt | 6 +- 2 files changed, 4167 insertions(+), 4 deletions(-) diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 2742aef6f6..78783f4a72 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -21,8 +21,32 @@ #include "RLEV2Util.hh" #include "RLEv2.hh" #include "Utils.hh" +#include "VectorDecoder.hh" +#include "DetectPlatform.hh" namespace orc { + void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { + uint64_t restLen = bufferEnd - bufferStart; + int bufferLength = 0; + const void* bufferPointer = nullptr; + + if (backupByteLen != 0) { + inputStream->BackUp(backupByteLen); + } + + if (len >= restLen && resetBuf == true) { + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::resetBufferStart"); + } + } + + if (bufferPointer == nullptr) { + bufferStart += len; + } else { + bufferStart = static_cast(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + } unsigned char RleDecoderV2::readByte() { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); @@ -67,6 +91,147 @@ namespace orc { } void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + uint64_t startBit = 0; +#if ENABLE_AVX512 + if (detect_platform() == arch_t::avx512_arch) { + switch (fbs) { + case 1: + unrolledUnpackVector1(data, offset, len); + return; + case 2: + unrolledUnpackVector2(data, offset, len); + return; + case 3: + unrolledUnpackVector3(data, offset, len); + return; + case 4: + unrolledUnpackVector4(data, offset, len); + return; + case 5: + unrolledUnpackVector5(data, offset, len); + return; + case 6: + unrolledUnpackVector6(data, offset, len); + return; + case 7: + unrolledUnpackVector7(data, offset, len); + return; + case 8: + unrolledUnpack8(data, offset, len); + return; + case 9: + unrolledUnpackVector9(data, offset, len); + return; + 
case 10: + unrolledUnpackVector10(data, offset, len); + return; + case 11: + unrolledUnpackVector11(data, offset, len); + return; + case 12: + unrolledUnpackVector12(data, offset, len); + return; + case 13: + unrolledUnpackVector13(data, offset, len); + return; + case 14: + unrolledUnpackVector14(data, offset, len); + return; + case 15: + unrolledUnpackVector15(data, offset, len); + return; + case 16: + unrolledUnpackVector16(data, offset, len); + return; + case 17: + unrolledUnpackVector17(data, offset, len); + return; + case 18: + unrolledUnpackVector18(data, offset, len); + return; + case 19: + unrolledUnpackVector19(data, offset, len); + return; + case 20: + unrolledUnpackVector20(data, offset, len); + return; + case 21: + unrolledUnpackVector21(data, offset, len); + return; + case 22: + unrolledUnpackVector22(data, offset, len); + return; + case 23: + unrolledUnpackVector23(data, offset, len); + return; + case 24: + unrolledUnpackVector24(data, offset, len); + return; + case 26: + unrolledUnpackVector26(data, offset, len); + return; + case 28: + unrolledUnpackVector28(data, offset, len); + return; + case 30: + unrolledUnpackVector30(data, offset, len); + return; + case 32: + unrolledUnpack32(data, offset, len); + return; + case 40: + unrolledUnpack40(data, offset, len); + return; + case 48: + unrolledUnpack48(data, offset, len); + return; + case 56: + unrolledUnpack56(data, offset, len); + return; + case 64: + unrolledUnpack64(data, offset, len); + return; + default: + // Fallback to the default implementation for deprecated bit size. 
+ plainUnpackLongs(data, offset, len, fbs, startBit); + return; + } + } else { + switch (fbs) { + case 4: + unrolledUnpack4(data, offset, len); + return; + case 8: + unrolledUnpack8(data, offset, len); + return; + case 16: + unrolledUnpack16(data, offset, len); + return; + case 24: + unrolledUnpack24(data, offset, len); + return; + case 32: + unrolledUnpack32(data, offset, len); + return; + case 40: + unrolledUnpack40(data, offset, len); + return; + case 48: + unrolledUnpack48(data, offset, len); + return; + case 56: + unrolledUnpack56(data, offset, len); + return; + case 64: + unrolledUnpack64(data, offset, len); + return; + default: + // Fallback to the default implementation for deprecated bit size. + plainUnpackLongs(data, offset, len, fbs, startBit); + return; + } + } +#else switch (fbs) { case 4: unrolledUnpack4(data, offset, len); @@ -97,10 +262,4004 @@ namespace orc { return; default: // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs); + plainUnpackLongs(data, offset, len, fbs, startBit); + return; + } +#endif + } + +#if ENABLE_AVX512 + void RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 1; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - 
startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); + while (numElements >= 64) { + uint64_t src_64 = *(uint64_t *)srcPtr; + // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF + __m512i srcmm = _mm512_movm_epi8(src_64); + // make 0x00 --> 0x00, 0xFF --> 0x01 + srcmm = _mm512_abs_epi8(srcmm); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); + _mm512_storeu_si512(vectorBuf8, srcmm); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 2; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { 
+ bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) + __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 + while (numElements >= 64) { + __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + __m512i srcmm0, srcmm1, srcmm2, tmpmm; + + srcmm2 = _mm512_srli_epi16(srcmm3, 2); + srcmm1 = _mm512_srli_epi16(srcmm3, 4); + srcmm0 = _mm512_srli_epi16(srcmm3, 6); + + // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. 
+ // move them into their places + // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn + + // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 + srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 + srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op + + tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl + srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop + + srcmm0 = _mm512_and_si512(srcmm0, parse_mask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, 
bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 3; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - 
bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + 
bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len){ + uint32_t bitWidth = 4; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > 
numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) + __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 + while (numElements >= 64) { + __m512i srcmm0, srcmm1, tmpmm; + + srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm0 = _mm512_srli_epi16(srcmm1, 4); + + // move elements into their places + // srcmm0: a c e g 0 0 0 0 + // srcmm1: b d f h 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh + + // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
+ srcmm0 = _mm512_and_si512(srcmm0, parseMask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len){ + uint32_t bitWidth = 5; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + 
resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + 
// shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 6; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while 
(len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = 
_mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 7; + const 
uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH , ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable7u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = 
_mm512_load_si512(shuffleIdxTable7u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 9; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if 
(numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // 
permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 7); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + 
plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 10; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = 
reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_load_si512(shiftTable10u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = 
bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 11; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); 
+ + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 5); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, 
ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 12; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + 
backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_load_si512(shiftTable12u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if 
(bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 13; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 
0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, 
zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 3); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + 
bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 14; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + 
tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering 
even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 15; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, 
ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); + 
shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = 
_mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 1); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector16(int64_t *data, uint64_t 
offset, uint64_t len) { + uint32_t bitWidth = 16; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = len; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + int64_t* dstPtr = data + offset; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 32) { + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + while (numElements >= 32) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); + _mm512_storeu_si512(vectorBuf16, srcmm); + + srcPtr += 4 * bitWidth; + resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unrolledUnpack16(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + unrolledUnpack16(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - 
bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 17; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 
= _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = 
_mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 15); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 18; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if 
(numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, 
highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 14); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, 
backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 19; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = 
reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + 
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 13); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } 
+ + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 20; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0u) { + uint32_t align = getAlign(startBit, bitWidth, 32u); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + 
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16u) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); + __m512i shiftMask = _mm512_load_si512(shiftTable20u); + + while (numElements >= 16u) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + 
resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 21; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0u) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if 
(numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in 
zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 11); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, 
bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 22; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = 
bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable22u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = 
_mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 10); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if 
(backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 23; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + 
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + 
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 9); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + 
resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 24; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + + __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); + + while (numElements >= 16) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 
16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unrolledUnpack24(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + unrolledUnpack24(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 26; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = 
fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + 
bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 6); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + 
if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 28; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + 
numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_load_si512(shiftTable28u); + + while (numElements >= 16) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, 
ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 30; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + 
backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + bufRestByteLen = bufferEnd - bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi64(zmm[1], 
shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 2u); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4u); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = 
_mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); + } + } + + void RleDecoderV2::unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 32; + const uint8_t *srcPtr = reinterpret_cast(bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = bufferEnd - bufferStart; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 16) { + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + while (numElements >= 16) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); + _mm512_storeu_si512(vectorBuf32, srcmm); + + srcPtr += 2 * bitWidth; + resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unrolledUnpack32(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(bufferStart); + dstPtr += numElements; + bufRestByteLen = bufferEnd - bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + unrolledUnpack32(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + } + + bufRestByteLen = bufferEnd - bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(bufferStart); } } +#endif void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { uint64_t curIdx = offset; @@ -376,7 +4535,8 @@ namespace orc { } } - void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, + uint64_t fbs, uint64_t& startBit) { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; @@ -395,6 +4555,7 @@ namespace orc { result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); 
} data[i] = static_cast(result); + startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); } } diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index 4cf55ddf80..b8ccd61a84 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -16,12 +16,13 @@ include_directories( ${PROJECT_BINARY_DIR}/c++/src ) -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX11_FLAGS} ${WARN_FLAGS}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") add_executable (orc-test MemoryInputStream.cc MemoryOutputStream.cc TestAttributes.cc + TestBlockBuffer.cc TestBufferedOutputStream.cc TestBloomFilter.cc TestByteRle.cc @@ -58,6 +59,7 @@ target_link_libraries (orc-test orc::protobuf orc::snappy orc::zlib + orc::gtest orc::gmock ) @@ -88,4 +90,4 @@ if (WIN32) APPEND PROPERTY ENVIRONMENT "TZDIR=${TZDATA_DIR}" ) -endif () +endif () \ No newline at end of file From e7a91195693010e237c14ec0160b7d24b9da4a89 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 11 Jan 2023 07:59:19 +0530 Subject: [PATCH 04/80] Fix the code format. 
--- c++/src/RleDecoderV2.cc | 32 ++++++++++++++++---------------- c++/test/CMakeLists.txt | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 78783f4a72..54c4f0303f 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -1220,7 +1220,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -1403,7 +1403,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -1525,7 +1525,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -1717,7 +1717,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd 
- bufferStart; @@ -1839,7 +1839,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2031,7 +2031,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2165,7 +2165,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2432,7 +2432,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2615,7 +2615,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, 
ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2798,7 +2798,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -2981,7 +2981,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -3103,7 +3103,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -3286,7 +3286,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; @@ -3449,12 +3449,12 @@ namespace orc { if (startBit != 0) { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); resetBuf = true; } } @@ -3470,7 +3470,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index b8ccd61a84..4fd0f70fdc 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -90,4 +90,4 @@ if (WIN32) APPEND PROPERTY ENVIRONMENT "TZDIR=${TZDATA_DIR}" ) -endif () \ No newline at end of file +endif () From cfde08f9f31573af7fdb434c47b34c1ee8089152 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 11 Jan 2023 08:44:23 +0530 Subject: [PATCH 05/80] Modify TestRleVectorDecoder.cc to match the new format. 
--- c++/test/TestRleVectorDecoder.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 05cc31cd31..a1f5ebd483 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -632,8 +632,8 @@ namespace orc { } printf("\n"); } -#endif - INSTANTIATE_TEST_CASE_P(OrcTest, RleVectorTest, Values(true,false)); + INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true,false)); +#endif } From 83419439d4728291a625b9f399cc6c1392197aae Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 11 Jan 2023 11:12:28 +0530 Subject: [PATCH 06/80] Fix a mistake on function name --- c++/src/RleDecoderV2.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 54c4f0303f..cb39fa025b 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -177,7 +177,7 @@ namespace orc { unrolledUnpackVector30(data, offset, len); return; case 32: - unrolledUnpack32(data, offset, len); + unrolledUnpackVector32(data, offset, len); return; case 40: unrolledUnpack40(data, offset, len); From e840649fbef427fddff2ed78a5aff9a6c037ce02 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 11 Jan 2023 14:16:49 +0530 Subject: [PATCH 07/80] Modified code into namespace orc --- c++/src/DetectPlatform.hh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index 4281a8c8a7..d47fb8bcc2 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -19,6 +19,8 @@ #ifndef ORC_DETECTPLATFORM_HH #define ORC_DETECTPLATFORM_HH +namespace orc +{ #ifdef _WIN32 #include "intrin.h" @@ -45,8 +47,6 @@ unsigned long long _xgetbv(unsigned int index) { #endif -namespace orc -{ #define CPUID_AVX512F 0x00100000 #define CPUID_AVX512CD 0x00200000 #define CPUID_AVX512VL 0x04000000 From c7962d5196357a43cbb64bdca83f9f3b1d74acd4 Mon Sep 17 00:00:00 2001 From: 
wpleonardo Date: Wed, 11 Jan 2023 14:41:29 +0530 Subject: [PATCH 08/80] Modify function name to fix a build issue. --- c++/src/DetectPlatform.hh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index d47fb8bcc2..24a2117c25 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -35,7 +35,7 @@ void cpuid(int info[4], int InfoType) { __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); } -unsigned long long _xgetbv(unsigned int index) { +unsigned long long xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__( "xgetbv;" @@ -72,7 +72,7 @@ unsigned long long _xgetbv(unsigned int index) { if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { // Check if XMM state and YMM state are saved - unsigned long long xcr_feature_mask = _xgetbv(0); + unsigned long long xcr_feature_mask = xgetbv(0); if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now From 495a62040dd0dbade3002a9f7305d1e466d9392e Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Thu, 12 Jan 2023 06:42:44 +0530 Subject: [PATCH 09/80] Modify code format. 
--- c++/src/DetectPlatform.hh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index 24a2117c25..a04455c3a6 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -19,6 +19,10 @@ #ifndef ORC_DETECTPLATFORM_HH #define ORC_DETECTPLATFORM_HH +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wold-style-cast") +#endif + namespace orc { #ifdef _WIN32 @@ -31,11 +35,11 @@ namespace orc #include #include -void cpuid(int info[4], int InfoType) { - __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); -} + void cpuid(int info[4], int InfoType) { + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); + } -unsigned long long xgetbv(unsigned int index) { + unsigned long long xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__( "xgetbv;" @@ -43,7 +47,7 @@ unsigned long long xgetbv(unsigned int index) { : "c" (index) ); return ((unsigned long long) edx << 32) | eax; -} + } #endif From 5c937e67d1b9a57b12742454ab857a32c255cccd Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Thu, 12 Jan 2023 08:43:59 +0530 Subject: [PATCH 10/80] Fix a build issue about int64 has different printf format between macos and linux. 
--- c++/test/TestRleVectorDecoder.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index a1f5ebd483..8fde0bb1dc 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -23,6 +23,7 @@ #include "wrap/orc-proto-wrapper.hh" #include "wrap/gtest-wrapper.h" +#include #ifdef __clang__ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") @@ -129,7 +130,7 @@ namespace orc { int32_t lpad = offset * BARWIDTH / total; int32_t rpad = BARWIDTH - lpad; - printf("\r%s:%3d%% [%.*s%*s] [%ld/%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); + printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", testName, val, lpad, BARSTR, rpad, "", offset, total); fflush(stdout); } From a87c2816ad58a25744562e11cbf4ec804c34ecfc Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Thu, 12 Jan 2023 11:39:30 +0530 Subject: [PATCH 11/80] Fix build issue on windows. --- c++/src/DetectPlatform.hh | 4 ++++ c++/test/TestRleVectorDecoder.cc | 1 + 2 files changed, 5 insertions(+) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index a04455c3a6..2c0e13ce4a 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -76,7 +76,11 @@ namespace orc if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { // Check if XMM state and YMM state are saved +#ifdef _WIN32 + unsigned long long xcr_feature_mask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else unsigned long long xcr_feature_mask = xgetbv(0); +#endif if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 8fde0bb1dc..f52690db57 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -27,6 +27,7 @@ #ifdef __clang__ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") + 
DIAGNOSTIC_IGNORE("-Wclang-format-violations") #endif namespace orc { From d8fcbe6832bfadaeb9e8d090e1dd73d8e37efcdc Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Thu, 12 Jan 2023 13:14:22 +0530 Subject: [PATCH 12/80] Fix some code format issue and function name. --- CMakeLists.txt | 4 ++-- c++/src/DetectPlatform.hh | 10 +++++----- c++/src/RleDecoderV2.cc | 2 +- c++/src/VectorDecoder.hh | 4 ++-- c++/test/TestRleVectorDecoder.cc | 10 +++++----- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b936062b19..35fd4ab038 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ option(BUILD_CPP_ENABLE_METRICS "Enable the metrics collection at compile phase" OFF) -option(ENABLE_AVX512_BIT_PACKING +option(BUILD_ENABLE_AVX512 "Enable AVX512 vector decode of bit-packing" OFF) @@ -169,7 +169,7 @@ else () add_compile_definitions(ENABLE_METRICS=0) endif () -if (ENABLE_AVX512_BIT_PACKING) +if (BUILD_ENABLE_AVX512) message(STATUS "Enable the AVX512 vector decode of bit-packing") add_compile_definitions(ENABLE_AVX512=1) else () diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index 2c0e13ce4a..bbec1402f2 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -66,13 +66,13 @@ namespace orc avx512_arch = 2 }; - arch_t detect_platform() { + arch_t detectPlatform() { arch_t detected_platform = arch_t::px_arch; - int cpu_info[4]; - cpuid(cpu_info, 1); + int cpuInfo[4]; + cpuid(cpuInfo, 1); - bool avx512_support_cpu = cpu_info[1] & CPUID_AVX512_MASK; - bool os_uses_XSAVE_XSTORE = cpu_info[2] & EXC_OSXSAVE; + bool avx512_support_cpu = cpuInfo[1] & CPUID_AVX512_MASK; + bool os_uses_XSAVE_XSTORE = cpuInfo[2] & EXC_OSXSAVE; if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { // Check if XMM state and YMM state are saved diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index cb39fa025b..7849e4d1a7 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -93,7 +93,7 @@ 
namespace orc { void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { uint64_t startBit = 0; #if ENABLE_AVX512 - if (detect_platform() == arch_t::avx512_arch) { + if (detectPlatform() == arch_t::avx512_arch) { switch (fbs) { case 1: unrolledUnpackVector1(data, offset, len); diff --git a/c++/src/VectorDecoder.hh b/c++/src/VectorDecoder.hh index 8100c9e698..c2629a3419 100644 --- a/c++/src/VectorDecoder.hh +++ b/c++/src/VectorDecoder.hh @@ -19,11 +19,11 @@ #ifndef VECTOR_DECODER_HH #define VECTOR_DECODER_HH +#if ENABLE_AVX512 #include #include namespace orc { -#if ENABLE_AVX512 #define ORC_VECTOR_BITS_2_BYTE(x) (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */ #define ORC_VECTOR_ONE_64U (1ULL) #define ORC_VECTOR_MAX_16U 0xFFFF /**< Max value for uint16_t */ @@ -501,6 +501,6 @@ inline uint64_t moveLen(uint64_t x, uint64_t y) { } return result; } +} // namespace orc #endif -} #endif diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index f52690db57..ac3dd43e9a 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -21,15 +21,15 @@ #include "MemoryOutputStream.hh" #include "RLEv2.hh" +#ifdef __clang__ +DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +DIAGNOSTIC_IGNORE("-Wclang-format-violations") +#endif + #include "wrap/orc-proto-wrapper.hh" #include "wrap/gtest-wrapper.h" #include -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") - DIAGNOSTIC_IGNORE("-Wclang-format-violations") -#endif - namespace orc { using ::testing::TestWithParam; From 668335cad84d101fc8f12821a08739497715b85a Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Sat, 14 Jan 2023 09:13:35 +0530 Subject: [PATCH 13/80] 1. Modified the code format; 2. Add the dynamiclly judge the current compiler and platform support AVX512 or not; 3. The build option BUILD_ENABLE_AVX512 default value change to "ON"; 4. 
Add the build option about file TestRleVectorDecoder.cc, and try to fix clang format build issue. --- CMakeLists.txt | 144 ++++++++++++++++++++++++++++++- c++/src/DetectPlatform.hh | 14 +-- c++/src/RleDecoderV2.cc | 2 +- c++/test/TestRleVectorDecoder.cc | 4 +- 4 files changed, 152 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35fd4ab038..a799ea767b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ option(BUILD_CPP_ENABLE_METRICS option(BUILD_ENABLE_AVX512 "Enable AVX512 vector decode of bit-packing" - OFF) + ON) # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) @@ -91,6 +91,17 @@ if (BUILD_POSITION_INDEPENDENT_LIB) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif () +if(NOT DEFINED ORC_SIMD_LEVEL) + set(ORC_SIMD_LEVEL + "DEFAULT" + CACHE STRING "Compile time SIMD optimization level") +endif() +if(NOT DEFINED ORC_RUNTIME_SIMD_LEVEL) + set(ORC_RUNTIME_SIMD_LEVEL + "MAX" + CACHE STRING "Max runtime SIMD optimization level") +endif() + # # Compiler specific flags # @@ -161,6 +172,135 @@ elseif (MSVC) set (WARN_FLAGS "${WARN_FLAGS} -wd4146") # unary minus operator applied to unsigned type, result still unsigned endif () +include(CheckCXXCompilerFlag) +include(CheckCXXSourceCompiles) +message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") + +if(NOT DEFINED ORC_CPU_FLAG) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64") + set(ORC_CPU_FLAG "x86") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") + set(ORC_CPU_FLAG "aarch64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") + set(ORC_CPU_FLAG "aarch32") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "powerpc|ppc") + set(ORC_CPU_FLAG "ppc") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") + set(ORC_CPU_FLAG "s390x") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") + set(ORC_CPU_FLAG "riscv64") + else() + message(FATAL_ERROR "Unknown system processor") + endif() +endif() + +# Check architecture specific compiler flags 
+if(ORC_CPU_FLAG STREQUAL "x86") + # x86/amd64 compiler flags, msvc/gcc/clang + if(MSVC) + set(ORC_SSE4_2_FLAG "") + set(ORC_AVX2_FLAG "/arch:AVX2") + set(ORC_AVX512_FLAG "/arch:AVX512") + set(CXX_SUPPORTS_SSE4_2 TRUE) + else() + set(ORC_SSE4_2_FLAG "-msse4.2") + set(ORC_AVX2_FLAG "-march=haswell") + # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ + set(ORC_AVX512_FLAG "-march=skylake-avx512 -mbmi2") + # Append the avx2/avx512 subset option also, fix issue ORC-9877 for homebrew-cpp + set(ORC_AVX2_FLAG "${ORC_AVX2_FLAG} -mavx2") + set(ORC_AVX512_FLAG + "${ORC_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") + check_cxx_compiler_flag(${ORC_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) + endif() + check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) + if(MINGW) + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 + message(STATUS "Disable AVX512 support on MINGW for now") + else() + # Check for AVX512 support in the compiler. + set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") + check_cxx_source_compiles(" + #ifdef _MSC_VER + #include + #else + #include + #endif + + int main() { + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; + }" + CXX_SUPPORTS_AVX512) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + endif() + # Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL + if(CXX_SUPPORTS_SSE4_2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES + "^(SSE4_2|AVX2|AVX512|MAX)$") + set(ORC_HAVE_RUNTIME_SSE4_2 ON) + add_definitions(-DORC_HAVE_RUNTIME_SSE4_2) + endif() + if(CXX_SUPPORTS_AVX2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$") + set(ORC_HAVE_RUNTIME_AVX2 ON) + add_definitions(-DORC_HAVE_RUNTIME_AVX2 -DORC_HAVE_RUNTIME_BMI2) + endif() + if(CXX_SUPPORTS_AVX512 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") + set(ORC_HAVE_RUNTIME_AVX512 ON) + 
add_definitions(-DORC_HAVE_RUNTIME_AVX512 -DORC_HAVE_RUNTIME_BMI2) + endif() + if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") + set(ORC_SIMD_LEVEL "AVX512") + endif() + +elseif(ORC_CPU_FLAG STREQUAL "ppc") + # power compiler flags, gcc/clang only + set(ORC_ALTIVEC_FLAG "-maltivec") + check_cxx_compiler_flag(${ORC_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC) + if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") + set(ORC_SIMD_LEVEL "NONE") + endif() +elseif(ORC_CPU_FLAG STREQUAL "aarch64") + # Arm64 compiler flags, gcc/clang only + set(ORC_ARMV8_MARCH "armv8-a") + check_cxx_compiler_flag("-march=${ORC_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) + if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") + set(ORC_SIMD_LEVEL "NEON") + endif() +endif() + +# Only enable additional instruction sets if they are supported +if(ORC_CPU_FLAG STREQUAL "x86") + if(MINGW) + # Enable _xgetbv() intrinsic to query OS support for ZMM register saves + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mxsave") + endif() + if(ORC_SIMD_LEVEL STREQUAL "AVX512") + if(NOT CXX_SUPPORTS_AVX512) + message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") + endif() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_AVX512_FLAG}") + add_definitions(-DORC_HAVE_AVX512 -DORC_HAVE_AVX2 -DORC_HAVE_BMI2 + -DORC_HAVE_SSE4_2) + elseif(ORC_SIMD_LEVEL STREQUAL "AVX2") + if(NOT CXX_SUPPORTS_AVX2) + message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") + endif() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_AVX2_FLAG}") + add_definitions(-DORC_HAVE_AVX2 -DORC_HAVE_BMI2 -DORC_HAVE_SSE4_2) + elseif(ORC_SIMD_LEVEL STREQUAL "SSE4_2") + if(NOT CXX_SUPPORTS_SSE4_2) + message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.") + endif() + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_SSE4_2_FLAG}") + add_definitions(-DORC_HAVE_SSE4_2) + elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") + message(WARNING "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") + endif() +endif() + if (BUILD_CPP_ENABLE_METRICS) message(STATUS "Enable the 
metrics collection") add_compile_definitions(ENABLE_METRICS=1) @@ -169,7 +309,7 @@ else () add_compile_definitions(ENABLE_METRICS=0) endif () -if (BUILD_ENABLE_AVX512) +if (BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND ORC_SIMD_LEVEL STREQUAL "AVX512") message(STATUS "Enable the AVX512 vector decode of bit-packing") add_compile_definitions(ENABLE_AVX512=1) else () diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index bbec1402f2..688cd19177 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -60,14 +60,14 @@ namespace orc #define CPUID_AVX512_MASK (CPUID_AVX512F | CPUID_AVX512CD | CPUID_AVX512VL | CPUID_AVX512BW | CPUID_AVX512DQ) - enum arch_t { - px_arch = 0, - avx2_arch = 1, - avx512_arch = 2 + enum class Arch { + PX_ARCH = 0, + AVX2_ARCH = 1, + AVX512_ARCH = 2 }; - arch_t detectPlatform() { - arch_t detected_platform = arch_t::px_arch; + Arch detectPlatform() { + Arch detected_platform = Arch::PX_ARCH; int cpuInfo[4]; cpuid(cpuInfo, 1); @@ -84,7 +84,7 @@ namespace orc if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now - detected_platform = arch_t::avx512_arch; + detected_platform = Arch::AVX512_ARCH; } } } diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 7849e4d1a7..422bb850b1 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -93,7 +93,7 @@ namespace orc { void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { uint64_t startBit = 0; #if ENABLE_AVX512 - if (detectPlatform() == arch_t::avx512_arch) { + if (detectPlatform() == Arch::AVX512_ARCH) { switch (fbs) { case 1: unrolledUnpackVector1(data, offset, len); diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index ac3dd43e9a..292368c831 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -22,8 +22,8 @@ #include "RLEv2.hh" #ifdef __clang__ 
-DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") -DIAGNOSTIC_IGNORE("-Wclang-format-violations") +#pragma clang diagnostic ignored "-Wmissing-variable-declarations" +#pragma clang diagnostic ignored "-Wclang-format-violations" #endif #include "wrap/orc-proto-wrapper.hh" From 46daa2db5987f1646f4add5a6701d9454194388b Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Sat, 14 Jan 2023 14:49:56 +0530 Subject: [PATCH 14/80] 1. Use clang-format to modify the code format of TestRleVectorDecoder.cc 2. Change CMakeLists.txt some options --- CMakeLists.txt | 3 +-- c++/test/TestRleVectorDecoder.cc | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a799ea767b..01c1aaddf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,12 +206,11 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SSE4_2_FLAG "-msse4.2") set(ORC_AVX2_FLAG "-march=haswell") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ - set(ORC_AVX512_FLAG "-march=skylake-avx512 -mbmi2") + set(ORC_AVX512_FLAG "-march=native -mbmi2") # Append the avx2/avx512 subset option also, fix issue ORC-9877 for homebrew-cpp set(ORC_AVX2_FLAG "${ORC_AVX2_FLAG} -mavx2") set(ORC_AVX512_FLAG "${ORC_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") - check_cxx_compiler_flag(${ORC_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) if(MINGW) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 292368c831..42885b73a1 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -20,16 +20,14 @@ #include "MemoryOutputStream.hh" #include "RLEv2.hh" - -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wmissing-variable-declarations" -#pragma clang diagnostic ignored "-Wclang-format-violations" -#endif - #include "wrap/orc-proto-wrapper.hh" #include "wrap/gtest-wrapper.h" #include +#ifdef __clang__ 
+DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +#endif + namespace orc { using ::testing::TestWithParam; From 415d1eb43935721af3acce4bdea169c7ab157ef7 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Sun, 15 Jan 2023 07:05:49 +0530 Subject: [PATCH 15/80] 1. Use clang-format -style=google to format code style of TestRleVectorDecoder.cc 2. Change the option CXX_COMMON_FLAGS to CMAKE_CXX_FLAGS --- CMakeLists.txt | 8 +- c++/test/TestRleVectorDecoder.cc | 942 +++++++++++++++---------------- 2 files changed, 462 insertions(+), 488 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 01c1aaddf4..55cf0a14a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -274,26 +274,26 @@ endif() if(ORC_CPU_FLAG STREQUAL "x86") if(MINGW) # Enable _xgetbv() intrinsic to query OS support for ZMM register saves - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mxsave") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") endif() if(ORC_SIMD_LEVEL STREQUAL "AVX512") if(NOT CXX_SUPPORTS_AVX512) message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_AVX512_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") add_definitions(-DORC_HAVE_AVX512 -DORC_HAVE_AVX2 -DORC_HAVE_BMI2 -DORC_HAVE_SSE4_2) elseif(ORC_SIMD_LEVEL STREQUAL "AVX2") if(NOT CXX_SUPPORTS_AVX2) message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_AVX2_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX2_FLAG}") add_definitions(-DORC_HAVE_AVX2 -DORC_HAVE_BMI2 -DORC_HAVE_SSE4_2) elseif(ORC_SIMD_LEVEL STREQUAL "SSE4_2") if(NOT CXX_SUPPORTS_SSE4_2) message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.") endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ORC_SSE4_2_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_SSE4_2_FLAG}") add_definitions(-DORC_HAVE_SSE4_2) elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") message(WARNING 
"ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 42885b73a1..f9b9480957 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -16,13 +16,14 @@ * limitations under the License. */ +#include + #include #include "MemoryOutputStream.hh" #include "RLEv2.hh" -#include "wrap/orc-proto-wrapper.hh" #include "wrap/gtest-wrapper.h" -#include +#include "wrap/orc-proto-wrapper.hh" #ifdef __clang__ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") @@ -30,610 +31,583 @@ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") namespace orc { - using ::testing::TestWithParam; - using ::testing::Values; - - const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M - - - class RleVectorTest : public TestWithParam { - virtual void SetUp(); - - protected: - bool alignBitpacking; - std::unique_ptr getEncoder(RleVersion version, - MemoryOutputStream& memStream, - bool isSigned); - - void runExampleTest(int64_t* inputData, uint64_t inputLength, - unsigned char* expectedOutput, uint64_t outputLength); - - void runTest(RleVersion version, - uint64_t numValues, - int64_t start, - int64_t delta, - bool random, - bool isSigned, - uint8_t bitWidth, - uint64_t blockSize = 0, - uint64_t numNulls = 0); - }; - - void vectorDecodeAndVerify( - RleVersion version, - const MemoryOutputStream& memStream, - int64_t * data, - uint64_t numValues, - const char* notNull, - uint64_t blockSize, - bool isSinged) { - std::unique_ptr decoder = createRleDecoder( - std::unique_ptr(new SeekableArrayInputStream( - memStream.getData(), - memStream.getLength(), blockSize)), - isSinged, version, *getDefaultPool(), - getDefaultReaderMetrics()); - - int64_t* decodedData = new int64_t[numValues]; - decoder->next(decodedData, numValues, notNull); - - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - EXPECT_EQ(data[i], decodedData[i]); - } +using ::testing::TestWithParam; 
+using ::testing::Values; + +const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + +class RleVectorTest : public TestWithParam { + virtual void SetUp(); + + protected: + bool alignBitpacking; + std::unique_ptr getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned); + + void runExampleTest(int64_t* inputData, uint64_t inputLength, + unsigned char* expectedOutput, uint64_t outputLength); + + void runTest(RleVersion version, uint64_t numValues, int64_t start, + int64_t delta, bool random, bool isSigned, uint8_t bitWidth, + uint64_t blockSize = 0, uint64_t numNulls = 0); +}; + +void vectorDecodeAndVerify(RleVersion version, + const MemoryOutputStream& memStream, int64_t* data, + uint64_t numValues, const char* notNull, + uint64_t blockSize, bool isSinged) { + std::unique_ptr decoder = createRleDecoder( + std::unique_ptr(new SeekableArrayInputStream( + memStream.getData(), memStream.getLength(), blockSize)), + isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); + + int64_t* decodedData = new int64_t[numValues]; + decoder->next(decodedData, numValues, notNull); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); } + } + + delete[] decodedData; +} - delete [] decodedData; - } - - void RleVectorTest::SetUp() { - alignBitpacking = GetParam(); - } - - void generateDataFolBits( - uint64_t numValues, - int64_t start, - int64_t delta, - bool random, - int64_t* data, - uint8_t bitWidth, - uint64_t numNulls = 0, - char* notNull = nullptr) { - int64_t max = pow(2, bitWidth); - if (numNulls != 0 && notNull != nullptr) { - memset(notNull, 1, numValues); - while (numNulls > 0) { - uint64_t pos = static_cast(std::rand()) % numValues; - if (notNull[pos]) { - notNull[pos] = static_cast(0); - --numNulls; - } +void RleVectorTest::SetUp() { alignBitpacking = GetParam(); } + +void generateDataFolBits(uint64_t numValues, int64_t start, int64_t delta, + bool random, int64_t* data, 
uint8_t bitWidth, + uint64_t numNulls = 0, char* notNull = nullptr) { + int64_t max = pow(2, bitWidth); + if (numNulls != 0 && notNull != nullptr) { + memset(notNull, 1, numValues); + while (numNulls > 0) { + uint64_t pos = static_cast(std::rand()) % numValues; + if (notNull[pos]) { + notNull[pos] = static_cast(0); + --numNulls; } } + } - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - if (!random) { - data[i] = start + delta * static_cast(i); - } else { - data[i] = std::rand()%max; - } + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + if (!random) { + data[i] = start + delta * static_cast(i); + } else { + data[i] = std::rand() % max; } } } +} #define BARSTR "##################################################" #define BARWIDTH 50 - void testProgress(const char* testName, int64_t offset, int64_t total) { - int32_t val = offset * 100 / total; - int32_t lpad = offset * BARWIDTH / total; - int32_t rpad = BARWIDTH - lpad; - - printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", testName, val, lpad, BARSTR, rpad, "", offset, total); - fflush(stdout); - } - - std::unique_ptr RleVectorTest::getEncoder(RleVersion version, - MemoryOutputStream& memStream, - bool isSigned) - { - MemoryPool * pool = getDefaultPool(); - - return createRleEncoder( - std::unique_ptr( - new BufferedOutputStream( - *pool, &memStream, 500 * 1024, 1024, nullptr)), - isSigned, version, *pool, alignBitpacking); - } - - void RleVectorTest::runTest(RleVersion version, - uint64_t numValues, - int64_t start, - int64_t delta, - bool random, - bool isSigned, - uint8_t bitWidth, - uint64_t blockSize, - uint64_t numNulls) { - MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); +void testProgress(const char* testName, int64_t offset, int64_t total) { + int32_t val = offset * 100 / total; + int32_t lpad = offset * BARWIDTH / total; + int32_t rpad = BARWIDTH - lpad; + + printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", 
testName, val, lpad, + BARSTR, rpad, "", offset, total); + fflush(stdout); +} - std::unique_ptr encoder = getEncoder(version, memStream, isSigned); +std::unique_ptr RleVectorTest::getEncoder( + RleVersion version, MemoryOutputStream& memStream, bool isSigned) { + MemoryPool* pool = getDefaultPool(); - char* notNull = numNulls == 0 ? nullptr : new char[numValues]; - int64_t* data = new int64_t[numValues]; - generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); - encoder->add(data, numValues, notNull); - encoder->flush(); + return createRleEncoder( + std::unique_ptr(new BufferedOutputStream( + *pool, &memStream, 500 * 1024, 1024, nullptr)), + isSigned, version, *pool, alignBitpacking); +} - vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, isSigned); - delete [] data; - delete [] notNull; - } +void RleVectorTest::runTest(RleVersion version, uint64_t numValues, + int64_t start, int64_t delta, bool random, + bool isSigned, uint8_t bitWidth, uint64_t blockSize, + uint64_t numNulls) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr encoder = + getEncoder(version, memStream, isSigned); + + char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; + int64_t* data = new int64_t[numValues]; + generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, + notNull); + encoder->add(data, numValues, notNull); + encoder->flush(); + + vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, + isSigned); + delete[] data; + delete[] notNull; +} #if ENABLE_AVX512 - TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { - uint8_t bitWidth = 1; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("1bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { + uint8_t bitWidth = 1; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("1bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("1bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("1bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { - uint8_t bitWidth = 2; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("2bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { + uint8_t bitWidth = 2; + for (uint64_t blockSize = 1; blockSize <= 10000; 
blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("2bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("2bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("2bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { - uint8_t bitWidth = 3; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("3bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { + uint8_t bitWidth = 3; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("3bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("3bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("3bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_4bit) { - uint8_t bitWidth = 4; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("4bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_4bit) { + uint8_t bitWidth = 4; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("4bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("4bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("4bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { - uint8_t bitWidth = 5; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("5bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { + uint8_t bitWidth = 5; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("5bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - 
testProgress("5bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("5bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { - uint8_t bitWidth = 6; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("6bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { + uint8_t bitWidth = 6; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("6bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("6bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("6bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { - uint8_t bitWidth = 7; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("7bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { + uint8_t bitWidth = 7; + for (uint64_t blockSize = 1; blockSize <= 10000; 
blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("7bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("7bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("7bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { - uint8_t bitWidth = 9; +TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { + uint8_t bitWidth = 9; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("9bit Test 1st Part", blockSize, 10000); - } - printf("\n"); + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("9bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("9bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("9bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_10bit) { - uint8_t bitWidth = 10; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("10bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_10bit) { + uint8_t bitWidth = 10; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("10bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("10bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("10bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { - uint8_t bitWidth = 11; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("11bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { + uint8_t bitWidth = 11; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("11bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - 
testProgress("11bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("11bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { - uint8_t bitWidth = 12; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("12bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { + uint8_t bitWidth = 12; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("12bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("12bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("12bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { - uint8_t bitWidth = 13; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("13bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { + uint8_t bitWidth = 13; + for (uint64_t blockSize = 1; blockSize <= 
10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("13bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("13bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("13bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { - uint8_t bitWidth = 14; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("14bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { + uint8_t bitWidth = 14; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("14bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("14bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("14bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_15bit) { - uint8_t bitWidth = 15; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("15bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_15bit) { + uint8_t bitWidth = 15; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("15bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("15bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("15bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { - uint8_t bitWidth = 16; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("16bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { + uint8_t bitWidth = 16; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("16bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - 
testProgress("16bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("16bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { - uint8_t bitWidth = 17; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("17bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { + uint8_t bitWidth = 17; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("17bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("17bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("17bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} +TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { + uint8_t bitWidth = 18; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("18bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { - uint8_t bitWidth = 18; - for (uint64_t blockSize = 1; blockSize <= 
10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("18bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("18bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("18bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { + uint8_t bitWidth = 19; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("19bit Test 1st Part", blockSize, 10000); } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { - uint8_t bitWidth = 19; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("19bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("19bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("19bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_20bit) { + uint8_t bitWidth = 20; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("20bit Test 1st Part", blockSize, 10000); } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_20bit) { - uint8_t bitWidth = 20; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("20bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("20bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("20bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_21bit) { + uint8_t bitWidth = 21; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("21bit Test 1st Part", blockSize, 10000); } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_21bit) { - uint8_t bitWidth = 21; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("21bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + 
testProgress("21bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("21bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { + uint8_t bitWidth = 22; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("22bit Test 1st Part", blockSize, 10000); } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { - uint8_t bitWidth = 22; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("22bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("22bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("22bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { + uint8_t bitWidth = 23; + runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("23bit Test 1st Part", blockSize, 10000); } + printf("\n"); - TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { - 
uint8_t bitWidth = 23; - runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("23bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("23bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); +} - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("23bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); - } - - TEST_P(RleVectorTest, RleV2_basic_vector_decode_24bit) { - uint8_t bitWidth = 24; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("24bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_24bit) { + uint8_t bitWidth = 24; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("24bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("24bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, 
blockSize); } - printf("\n"); + testProgress("24bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { - uint8_t bitWidth = 26; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("26bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { + uint8_t bitWidth = 26; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("26bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("26bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("26bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { - uint8_t bitWidth = 28; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("28bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { + uint8_t bitWidth = 28; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("28bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t 
dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("28bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("28bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { - uint8_t bitWidth = 30; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("30bit Test 1st Part", blockSize, 10000); - } - printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { + uint8_t bitWidth = 30; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("30bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("30bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("30bit Test 2nd Part", blockSize, 10000); } + printf("\n"); +} - TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { - uint8_t bitWidth = 32; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("32bit Test 1st Part", blockSize, 10000); - } - 
printf("\n"); +TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { + uint8_t bitWidth = 32; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("32bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize +=1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("32bit Test 2nd Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printf("\n"); + testProgress("32bit Test 2nd Part", blockSize, 10000); } - - INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true,false)); -#endif + printf("\n"); } +INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true, false)); +#endif +} // namespace orc From cd2f71d5f9a4c563f61621ba3561e772299d3398 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 29 Jan 2023 22:46:44 -0500 Subject: [PATCH 16/80] 1. Use clang-format to modify the code style of c++/test/TestRleVectorDecoder.cc; 2. 
Temp change Enable_AVX512 to OFF --- CMakeLists.txt | 2 +- c++/test/TestRleVectorDecoder.cc | 907 +++++++++++++++---------------- 2 files changed, 452 insertions(+), 457 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 55cf0a14a6..702c5dcb33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ option(BUILD_CPP_ENABLE_METRICS option(BUILD_ENABLE_AVX512 "Enable AVX512 vector decode of bit-packing" - ON) + OFF) # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index f9b9480957..c3a3fc635e 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -31,583 +31,578 @@ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") namespace orc { -using ::testing::TestWithParam; -using ::testing::Values; - -const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M - -class RleVectorTest : public TestWithParam { - virtual void SetUp(); - - protected: - bool alignBitpacking; - std::unique_ptr getEncoder(RleVersion version, - MemoryOutputStream& memStream, - bool isSigned); - - void runExampleTest(int64_t* inputData, uint64_t inputLength, - unsigned char* expectedOutput, uint64_t outputLength); - - void runTest(RleVersion version, uint64_t numValues, int64_t start, - int64_t delta, bool random, bool isSigned, uint8_t bitWidth, - uint64_t blockSize = 0, uint64_t numNulls = 0); -}; - -void vectorDecodeAndVerify(RleVersion version, - const MemoryOutputStream& memStream, int64_t* data, - uint64_t numValues, const char* notNull, - uint64_t blockSize, bool isSinged) { - std::unique_ptr decoder = createRleDecoder( - std::unique_ptr(new SeekableArrayInputStream( - memStream.getData(), memStream.getLength(), blockSize)), - isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); - - int64_t* decodedData = new int64_t[numValues]; - decoder->next(decodedData, numValues, notNull); - - for (uint64_t i = 0; i < numValues; ++i) { - if 
(!notNull || notNull[i]) { - EXPECT_EQ(data[i], decodedData[i]); - } - } - - delete[] decodedData; -} - -void RleVectorTest::SetUp() { alignBitpacking = GetParam(); } - -void generateDataFolBits(uint64_t numValues, int64_t start, int64_t delta, - bool random, int64_t* data, uint8_t bitWidth, - uint64_t numNulls = 0, char* notNull = nullptr) { - int64_t max = pow(2, bitWidth); - if (numNulls != 0 && notNull != nullptr) { - memset(notNull, 1, numValues); - while (numNulls > 0) { - uint64_t pos = static_cast(std::rand()) % numValues; - if (notNull[pos]) { - notNull[pos] = static_cast(0); - --numNulls; + using ::testing::TestWithParam; + using ::testing::Values; + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + class RleVectorTest : public TestWithParam { + virtual void SetUp(); + + protected: + bool alignBitpacking; + std::unique_ptr getEncoder(RleVersion version, MemoryOutputStream& memStream, + bool isSigned); + + void runExampleTest(int64_t* inputData, uint64_t inputLength, unsigned char* expectedOutput, + uint64_t outputLength); + + void runTest(RleVersion version, uint64_t numValues, int64_t start, int64_t delta, bool random, + bool isSigned, uint8_t bitWidth, uint64_t blockSize = 0, uint64_t numNulls = 0); + }; + + void vectorDecodeAndVerify(RleVersion version, const MemoryOutputStream& memStream, int64_t* data, + uint64_t numValues, const char* notNull, uint64_t blockSize, + bool isSinged) { + std::unique_ptr decoder = + createRleDecoder(std::unique_ptr(new SeekableArrayInputStream( + memStream.getData(), memStream.getLength(), blockSize)), + isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); + + int64_t* decodedData = new int64_t[numValues]; + decoder->next(decodedData, numValues, notNull); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); } } + + delete[] decodedData; + } + + void RleVectorTest::SetUp() { + alignBitpacking = GetParam(); } - for (uint64_t i = 0; i < 
numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - if (!random) { - data[i] = start + delta * static_cast(i); - } else { - data[i] = std::rand() % max; + void generateDataFolBits(uint64_t numValues, int64_t start, int64_t delta, bool random, + int64_t* data, uint8_t bitWidth, uint64_t numNulls = 0, + char* notNull = nullptr) { + int64_t max = pow(2, bitWidth); + if (numNulls != 0 && notNull != nullptr) { + memset(notNull, 1, numValues); + while (numNulls > 0) { + uint64_t pos = static_cast(std::rand()) % numValues; + if (notNull[pos]) { + notNull[pos] = static_cast(0); + --numNulls; + } + } + } + + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + if (!random) { + data[i] = start + delta * static_cast(i); + } else { + data[i] = std::rand() % max; + } } } } -} #define BARSTR "##################################################" #define BARWIDTH 50 -void testProgress(const char* testName, int64_t offset, int64_t total) { - int32_t val = offset * 100 / total; - int32_t lpad = offset * BARWIDTH / total; - int32_t rpad = BARWIDTH - lpad; - - printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", testName, val, lpad, - BARSTR, rpad, "", offset, total); - fflush(stdout); -} - -std::unique_ptr RleVectorTest::getEncoder( - RleVersion version, MemoryOutputStream& memStream, bool isSigned) { - MemoryPool* pool = getDefaultPool(); - - return createRleEncoder( - std::unique_ptr(new BufferedOutputStream( - *pool, &memStream, 500 * 1024, 1024, nullptr)), - isSigned, version, *pool, alignBitpacking); -} - -void RleVectorTest::runTest(RleVersion version, uint64_t numValues, - int64_t start, int64_t delta, bool random, - bool isSigned, uint8_t bitWidth, uint64_t blockSize, - uint64_t numNulls) { - MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); - - std::unique_ptr encoder = - getEncoder(version, memStream, isSigned); - - char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; - int64_t* data = new int64_t[numValues]; - generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, - notNull); - encoder->add(data, numValues, notNull); - encoder->flush(); - - vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, - isSigned); - delete[] data; - delete[] notNull; -} + void testProgress(const char* testName, int64_t offset, int64_t total) { + int32_t val = offset * 100 / total; + int32_t lpad = offset * BARWIDTH / total; + int32_t rpad = BARWIDTH - lpad; -#if ENABLE_AVX512 -TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { - uint8_t bitWidth = 1; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("1bit Test 1st Part", blockSize, 10000); + printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", testName, val, lpad, BARSTR, rpad, "", + offset, total); + fflush(stdout); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); - } - testProgress("1bit Test 2nd Part", blockSize, 10000); + std::unique_ptr RleVectorTest::getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned) { + MemoryPool* pool = getDefaultPool(); + + return createRleEncoder(std::unique_ptr(new BufferedOutputStream( + *pool, &memStream, 500 * 1024, 1024, nullptr)), + isSigned, version, *pool, alignBitpacking); } - printf("\n"); -} -TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { - uint8_t bitWidth = 2; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("2bit Test 1st Part", blockSize, 10000); + void RleVectorTest::runTest(RleVersion version, uint64_t numValues, int64_t start, int64_t delta, 
+ bool random, bool isSigned, uint8_t bitWidth, uint64_t blockSize, + uint64_t numNulls) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr encoder = getEncoder(version, memStream, isSigned); + + char* notNull = numNulls == 0 ? nullptr : new char[numValues]; + int64_t* data = new int64_t[numValues]; + generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->flush(); + + vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, isSigned); + delete[] data; + delete[] notNull; } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); +#if ENABLE_AVX512 + TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { + uint8_t bitWidth = 1; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("1bit Test 1st Part", blockSize, 10000); } - testProgress("2bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { - uint8_t bitWidth = 3; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("3bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("1bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - 
runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { + uint8_t bitWidth = 2; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("2bit Test 1st Part", blockSize, 10000); } - testProgress("3bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_4bit) { - uint8_t bitWidth = 4; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("4bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("2bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { + uint8_t bitWidth = 3; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("3bit Test 1st Part", blockSize, 10000); } - testProgress("4bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { - uint8_t bitWidth = 5; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("5bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for 
(uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("3bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_4bit) { + uint8_t bitWidth = 4; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("4bit Test 1st Part", blockSize, 10000); } - testProgress("5bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { - uint8_t bitWidth = 6; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("6bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("4bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { + uint8_t bitWidth = 5; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("5bit Test 1st Part", blockSize, 10000); } - testProgress("6bit Test 2nd Part", blockSize, 10000); - } - 
printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { - uint8_t bitWidth = 7; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("7bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("5bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { + uint8_t bitWidth = 6; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("6bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("6bit Test 2nd Part", blockSize, 10000); } - testProgress("7bit Test 2nd Part", blockSize, 10000); + printf("\n"); } - printf("\n"); -} -TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { - uint8_t bitWidth = 9; + TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { + uint8_t bitWidth = 7; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("7bit Test 1st Part", blockSize, 10000); + } + printf("\n"); - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 
0, 0, true, false, bitWidth, blockSize); - testProgress("9bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("7bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { + uint8_t bitWidth = 9; + + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("9bit Test 1st Part", blockSize, 10000); } - testProgress("9bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_10bit) { - uint8_t bitWidth = 10; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("10bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("9bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_10bit) { + uint8_t bitWidth = 10; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("10bit Test 1st Part", blockSize, 10000); } - testProgress("10bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { - uint8_t bitWidth = 11; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("11bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("10bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { + uint8_t bitWidth = 11; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("11bit Test 1st Part", blockSize, 10000); } - testProgress("11bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { - uint8_t bitWidth = 12; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("12bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("11bit Test 2nd Part", blockSize, 10000); + } + 
printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { + uint8_t bitWidth = 12; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("12bit Test 1st Part", blockSize, 10000); } - testProgress("12bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { - uint8_t bitWidth = 13; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("13bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("12bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { + uint8_t bitWidth = 13; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("13bit Test 1st Part", blockSize, 10000); } - testProgress("13bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { - uint8_t bitWidth = 14; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("14bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("13bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { + uint8_t bitWidth = 14; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("14bit Test 1st Part", blockSize, 10000); } - testProgress("14bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_15bit) { - uint8_t bitWidth = 15; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("15bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("14bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_15bit) { + uint8_t bitWidth = 15; + for (uint64_t blockSize = 1; blockSize <= 
10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("15bit Test 1st Part", blockSize, 10000); } - testProgress("15bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { - uint8_t bitWidth = 16; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("16bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("15bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { + uint8_t bitWidth = 16; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("16bit Test 1st Part", blockSize, 10000); } - testProgress("16bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { - uint8_t bitWidth = 17; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("17bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("16bit Test 2nd Part", blockSize, 
10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { + uint8_t bitWidth = 17; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("17bit Test 1st Part", blockSize, 10000); } - testProgress("17bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { - uint8_t bitWidth = 18; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("18bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("17bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { + uint8_t bitWidth = 18; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("18bit Test 1st Part", blockSize, 10000); } - testProgress("18bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { - uint8_t bitWidth = 19; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("19bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("18bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { + uint8_t bitWidth = 19; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("19bit Test 1st Part", blockSize, 10000); } - testProgress("19bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_20bit) { - uint8_t bitWidth = 20; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("20bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("19bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_20bit) { + uint8_t bitWidth = 20; + for (uint64_t blockSize = 1; blockSize <= 
10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("20bit Test 1st Part", blockSize, 10000); } - testProgress("20bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_21bit) { - uint8_t bitWidth = 21; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("21bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("20bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_21bit) { + uint8_t bitWidth = 21; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("21bit Test 1st Part", blockSize, 10000); } - testProgress("21bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { - uint8_t bitWidth = 22; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("22bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("21bit Test 2nd Part", blockSize, 
10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { + uint8_t bitWidth = 22; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("22bit Test 1st Part", blockSize, 10000); } - testProgress("22bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { - uint8_t bitWidth = 23; - runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("23bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("22bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { + uint8_t bitWidth = 23; + runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("23bit Test 1st Part", blockSize, 10000); } - testProgress("23bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, 
RleV2_basic_vector_decode_24bit) { - uint8_t bitWidth = 24; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("24bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("23bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_24bit) { + uint8_t bitWidth = 24; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("24bit Test 1st Part", blockSize, 10000); } - testProgress("24bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { - uint8_t bitWidth = 26; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("26bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("24bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + 
TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { + uint8_t bitWidth = 26; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("26bit Test 1st Part", blockSize, 10000); } - testProgress("26bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { - uint8_t bitWidth = 28; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("28bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("26bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { + uint8_t bitWidth = 28; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("28bit Test 1st Part", blockSize, 10000); } - testProgress("28bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { - uint8_t bitWidth = 30; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("30bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + 
runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("28bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { + uint8_t bitWidth = 30; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("30bit Test 1st Part", blockSize, 10000); } - testProgress("30bit Test 2nd Part", blockSize, 10000); - } - printf("\n"); -} + printf("\n"); -TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { - uint8_t bitWidth = 32; - for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { - runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("32bit Test 1st Part", blockSize, 10000); + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("30bit Test 2nd Part", blockSize, 10000); + } + printf("\n"); } - printf("\n"); - for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { - for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { - runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { + uint8_t bitWidth = 32; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + testProgress("32bit Test 1st Part", blockSize, 10000); + } + printf("\n"); + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 
1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + testProgress("32bit Test 2nd Part", blockSize, 10000); } - testProgress("32bit Test 2nd Part", blockSize, 10000); + printf("\n"); } - printf("\n"); -} -INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true, false)); + INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true, false)); #endif } // namespace orc From f9ee0b433ad4be80dad262a3e5a1a8728c5ff28c Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 30 Jan 2023 18:55:38 -0500 Subject: [PATCH 17/80] Use clang-format to modify code style of files: c++/src/DetectPlatform.hh c++/src/RLEv2.hh c++/src/RleDecoderV2.cc c++/src/VectorDecoder.hh --- c++/src/DetectPlatform.hh | 52 +- c++/src/RLEv2.hh | 67 +-- c++/src/RleDecoderV2.cc | 971 ++++++++++++++++++++++---------------- c++/src/VectorDecoder.hh | 908 +++++++++++++++++------------------ 4 files changed, 1057 insertions(+), 941 deletions(-) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index 688cd19177..03fd158402 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -20,18 +20,17 @@ #define ORC_DETECTPLATFORM_HH #if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wold-style-cast") +DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif -namespace orc -{ +namespace orc { #ifdef _WIN32 #include "intrin.h" // Windows CPUID -#define cpuid(info, x) __cpuidex(info, x, 0) +#define cpuid(info, x) __cpuidex(info, x, 0) #else -// GCC Intrinsics +// GCC Intrinsics #include #include @@ -41,56 +40,49 @@ namespace orc unsigned long long xgetbv(unsigned int index) { unsigned int eax, edx; - __asm__ __volatile__( - "xgetbv;" - : "=a" (eax), "=d"(edx) - : "c" (index) - ); - return ((unsigned long long) edx << 32) | eax; + __asm__ __volatile__("xgetbv;" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((unsigned long long)edx << 32) | eax; } #endif - #define CPUID_AVX512F 0x00100000 - #define 
CPUID_AVX512CD 0x00200000 - #define CPUID_AVX512VL 0x04000000 - #define CPUID_AVX512BW 0x01000000 - #define CPUID_AVX512DQ 0x02000000 - #define EXC_OSXSAVE 0x08000000 // 27th bit +#define CPUID_AVX512F 0x00100000 +#define CPUID_AVX512CD 0x00200000 +#define CPUID_AVX512VL 0x04000000 +#define CPUID_AVX512BW 0x01000000 +#define CPUID_AVX512DQ 0x02000000 +#define EXC_OSXSAVE 0x08000000 // 27th bit - #define CPUID_AVX512_MASK (CPUID_AVX512F | CPUID_AVX512CD | CPUID_AVX512VL | CPUID_AVX512BW | CPUID_AVX512DQ) +#define CPUID_AVX512_MASK \ + (CPUID_AVX512F | CPUID_AVX512CD | CPUID_AVX512VL | CPUID_AVX512BW | CPUID_AVX512DQ) - enum class Arch { - PX_ARCH = 0, - AVX2_ARCH = 1, - AVX512_ARCH = 2 - }; + enum class Arch { PX_ARCH = 0, AVX2_ARCH = 1, AVX512_ARCH = 2 }; Arch detectPlatform() { Arch detected_platform = Arch::PX_ARCH; - int cpuInfo[4]; + int cpuInfo[4]; cpuid(cpuInfo, 1); - bool avx512_support_cpu = cpuInfo[1] & CPUID_AVX512_MASK; + bool avx512_support_cpu = cpuInfo[1] & CPUID_AVX512_MASK; bool os_uses_XSAVE_XSTORE = cpuInfo[2] & EXC_OSXSAVE; if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { // Check if XMM state and YMM state are saved #ifdef _WIN32 - unsigned long long xcr_feature_mask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ + unsigned long long xcr_feature_mask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ #else unsigned long long xcr_feature_mask = xgetbv(0); #endif - if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now - if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now - detected_platform = Arch::AVX512_ARCH; + if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now + if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now + detected_platform = Arch::AVX512_ARCH; } } } return detected_platform; } -} +} // namespace orc #endif diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index b2654b14fe..e87398d946 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -199,37 +199,37 @@ namespace 
orc { int64_t readVslong(); uint64_t readVulong(); void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, uint64_t fbs, - uint64_t& startBit); + void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit); #if ENABLE_AVX512 - void unrolledUnpackVector1(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector2(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector3(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector4(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector5(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector16(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len); - void 
unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len); - void unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len); + void unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len); + void 
unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len); #endif void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); @@ -266,9 +266,12 @@ namespace orc { DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run #if ENABLE_AVX512 - uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data - uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; // Used by vectorially 9~16 bit-unpacking data - uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; // Used by vectorially 17~32 bit-unpacking data + uint8_t + vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data + uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + + 1]; // Used by vectorially 9~16 bit-unpacking data + uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + + 1]; // Used by vectorially 17~32 bit-unpacking data #endif }; } // namespace orc diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 422bb850b1..c05a7cc1bf 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -18,11 +18,11 @@ #include "Adaptor.hh" #include "Compression.hh" +#include "DetectPlatform.hh" #include "RLEV2Util.hh" #include "RLEv2.hh" #include "Utils.hh" #include "VectorDecoder.hh" -#include "DetectPlatform.hh" namespace orc { void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { @@ -271,7 +271,7 @@ namespace orc { #if ENABLE_AVX512 void 
RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 1; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint32_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -283,7 +283,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -294,9 +295,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -317,9 +321,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -329,7 +334,7 @@ namespace orc { if (numElements >= 64) { __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); while (numElements >= 64) { - uint64_t src_64 = *(uint64_t *)srcPtr; 
+ uint64_t src_64 = *(uint64_t*)srcPtr; // convert mask to 512-bit register. 0 --> 0x00, 1 --> 0xFF __m512i srcmm = _mm512_movm_epi8(src_64); // make 0x00 --> 0x00, 0xFF --> 0x01 @@ -349,12 +354,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -376,13 +382,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } void RleDecoderV2::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 2; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint32_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -394,7 +400,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -405,9 +412,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen 
* ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -428,9 +438,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -438,8 +449,8 @@ namespace orc { } if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) - __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 + __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) + __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 while (numElements >= 64) { __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); __m512i srcmm0, srcmm1, srcmm2, tmpmm; @@ -453,18 +464,18 @@ namespace orc { // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 - srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 - srcmm1 = 
_mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 - srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op + srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 + srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op - tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl + tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop srcmm0 = _mm512_and_si512(srcmm0, parse_mask); @@ -483,12 +494,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -510,13 +522,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } void RleDecoderV2::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 3; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint32_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -528,7 +540,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += 
moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -539,9 +552,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -562,9 +578,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -573,15 +590,15 @@ namespace orc { if (numElements >= 64) { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); - __m512i 
shiftMaskPtr[2]; + __m512i shiftMaskPtr[2]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); @@ -617,12 +634,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -644,13 +662,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len){ + void RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 4; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -662,7 +680,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -673,9 +692,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + 
bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -696,9 +718,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -706,8 +729,8 @@ namespace orc { } if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) - __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 + __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) + __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 while (numElements >= 64) { __m512i srcmm0, srcmm1, tmpmm; @@ -717,9 +740,9 @@ namespace orc { // move elements into their places // srcmm0: a c e g 0 0 0 0 // srcmm1: b d f h 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
@@ -739,12 +762,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -766,13 +790,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len){ + void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 5; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -784,7 +808,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -795,9 +820,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = 
fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -818,9 +846,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -829,15 +858,15 @@ namespace orc { if (numElements >= 64) { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); - __m512i shiftMaskPtr[2]; + __m512i shiftMaskPtr[2]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); @@ -873,12 +902,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += 
numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -900,13 +930,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector6(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 6; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -918,7 +948,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -929,9 +960,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -952,9 +986,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = 
reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -963,15 +998,15 @@ namespace orc { if (numElements >= 64) { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); - __m512i shiftMaskPtr[2]; + __m512i shiftMaskPtr[2]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); @@ -1007,12 +1042,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1034,13 +1070,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector7(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 7; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = 
reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1052,7 +1088,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH , ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -1063,9 +1100,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1086,9 +1126,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1097,15 +1138,15 @@ namespace orc { if (numElements >= 64) { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable7u); + __m512i permutexIdx = 
_mm512_load_si512(permutexIdxTable7u); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); - __m512i shiftMaskPtr[2]; + __m512i shiftMaskPtr[2]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); @@ -1141,12 +1182,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1168,13 +1210,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector9(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 9; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1186,7 +1228,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -1197,9 +1240,12 @@ namespace orc { len -= numElements; } 
else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1220,9 +1266,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1231,23 +1278,23 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); permutexIdxPtr[1] = 
_mm512_load_si512(permutexIdxTable9u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1324,12 +1371,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1351,13 +1399,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector10(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 10; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1369,7 +1417,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ 
-1380,9 +1429,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1403,9 +1455,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1414,11 +1467,11 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); - __m512i shiftMask = _mm512_load_si512(shiftTable10u); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_load_si512(shiftTable10u); while (numElements >= 32) { __m512i srcmm, zmm; @@ -1446,12 +1499,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - 
ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1473,13 +1527,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector11(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 11; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1491,7 +1545,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -1502,9 +1557,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { 
numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1525,9 +1583,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1536,26 +1595,26 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); - __m512i shiftMaskPtr[4]; + __m512i shiftMaskPtr[4]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); while 
(numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1638,12 +1697,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1665,13 +1725,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector12(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 12; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1683,7 +1743,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -1694,9 +1755,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1717,9 +1781,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1728,11 +1793,11 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); - __m512i shiftMask = _mm512_load_si512(shiftTable12u); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_load_si512(shiftTable12u); while (numElements >= 32) { __m512i srcmm, zmm; @@ -1760,12 +1825,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = 
reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1787,13 +1853,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector13(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 13; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1805,7 +1871,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -1816,9 +1883,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -1839,9 +1909,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); 
plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -1850,26 +1921,26 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); - __m512i shiftMaskPtr[4]; + __m512i shiftMaskPtr[4]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1952,12 +2023,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { 
bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -1979,13 +2051,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector14(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 14; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -1997,7 +2069,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2008,9 +2081,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2031,9 +2107,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * 
bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2042,15 +2119,15 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); - __m512i shiftMaskPtr[2]; + __m512i shiftMaskPtr[2]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); @@ -2086,12 +2163,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2113,13 +2191,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void 
RleDecoderV2::unrolledUnpackVector15(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 15; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -2131,7 +2209,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2142,9 +2221,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2165,9 +2247,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2176,26 +2259,26 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); - __m512i shiftMaskPtr[4]; + __m512i shiftMaskPtr[4]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -2278,12 +2361,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2305,13 +2389,13 @@ 
namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector16(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 16; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = len; uint64_t bufMoveByteLen = 0; uint64_t bufRestByteLen = bufferEnd - bufferStart; @@ -2358,7 +2442,7 @@ namespace orc { if (numElements > 0) { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); unrolledUnpack16(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2366,10 +2450,11 @@ namespace orc { if (bufMoveByteLen <= bufRestByteLen) { resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; - } - + } + if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + ; unrolledUnpack16(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; @@ -2380,13 +2465,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector17(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 17; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -2398,7 +2483,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += 
moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2409,9 +2495,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2432,9 +2521,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2443,23 +2533,23 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = 
_mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2536,12 +2626,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2563,13 +2654,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector18(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 18; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -2581,7 +2672,8 @@ namespace orc { while (len > 0) { if 
(startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2592,9 +2684,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2615,9 +2710,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2626,23 +2722,23 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i 
reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2719,12 +2815,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2746,13 +2843,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector19(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 19; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t 
bufMoveByteLen = 0; @@ -2764,7 +2861,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2775,9 +2873,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2798,9 +2899,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2809,23 +2911,23 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = 
_mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2902,12 +3004,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -2929,13 +3032,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector20(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 20; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; 
int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -2947,7 +3050,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -2958,9 +3062,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -2981,9 +3088,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -2992,11 +3100,11 @@ namespace orc { if (numElements >= 16u) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); - 
__m512i shiftMask = _mm512_load_si512(shiftTable20u); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); + __m512i shiftMask = _mm512_load_si512(shiftTable20u); while (numElements >= 16u) { __m512i srcmm, zmm; @@ -3024,12 +3132,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3051,13 +3160,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector21(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 21; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3069,7 +3178,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -3080,9 +3190,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -3103,9 +3216,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -3114,23 +3228,23 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); - __m512i shiftMaskPtr[3]; + __m512i 
shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3207,12 +3321,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3234,13 +3349,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector22(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 22; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3252,7 +3367,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -3263,9 +3379,12 @@ namespace orc { len -= numElements; } else { if 
(startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -3286,9 +3405,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -3297,23 +3417,23 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); permutexIdxPtr[1] = 
_mm512_load_si512(permutexIdxTable22u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3390,12 +3510,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3417,13 +3538,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector23(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 23; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3436,7 +3557,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); 
} @@ -3447,14 +3569,17 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); resetBuf = true; } } @@ -3470,34 +3595,35 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; } } - + if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); + __m512i 
shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3574,12 +3700,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3587,8 +3714,8 @@ namespace orc { if (bufMoveByteLen <= bufRestByteLen) { resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; - } - + } + if (backupByteLen != 0) { resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); @@ -3601,13 +3728,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector24(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 24; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const 
uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3636,7 +3763,7 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - + __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); @@ -3663,7 +3790,7 @@ namespace orc { if (numElements > 0) { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); unrolledUnpack24(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3671,10 +3798,11 @@ namespace orc { if (bufMoveByteLen <= bufRestByteLen) { resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; - } - + } + if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + ; unrolledUnpack24(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; @@ -3685,13 +3813,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector26(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 26; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3703,7 +3831,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } 
else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -3714,9 +3843,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -3737,9 +3869,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + bufMoveByteLen -= + (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -3748,23 +3881,23 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); - __m512i 
permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); - __m512i shiftMaskPtr[3]; + __m512i shiftMaskPtr[3]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3841,12 +3974,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3868,13 +4002,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector28(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 28; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -3886,7 +4020,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - 
ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -3897,9 +4032,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -3920,9 +4058,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + bufMoveByteLen -= + (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -3931,11 +4070,11 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); - __m512i shiftMask = _mm512_load_si512(shiftTable28u); + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_load_si512(shiftTable28u); while (numElements >= 16) { __m512i srcmm, zmm; @@ -3963,12 
+4102,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -3990,13 +4130,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector30(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 30; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -4008,7 +4148,8 @@ namespace orc { while (len > 0) { if (startBit != 0) { - bufMoveByteLen += moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); } @@ -4019,9 +4160,12 @@ namespace orc { len -= numElements; } else { if (startBit != 0) { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + tailBitLen = 
fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); resetBuf = true; } else { numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; @@ -4042,9 +4186,10 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + bufMoveByteLen -= + (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); bufRestByteLen = bufferEnd - bufferStart; dstPtr += align; numElements -= align; @@ -4053,26 +4198,26 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr[2]; + __m512i shuffleIdxPtr[2]; shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); - __m512i permutexIdxPtr[2]; + __m512i permutexIdxPtr[2]; permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); - __m512i shiftMaskPtr[4]; + __m512i shiftMaskPtr[4]; shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); - __m512i gatherIdxmm = 
_mm512_load_si512(gatherIdxTable30u); + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -4154,12 +4299,13 @@ namespace orc { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); } else { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -4181,13 +4327,13 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } - void RleDecoderV2::unrolledUnpackVector32(int64_t *data, uint64_t offset, uint64_t len) { + void RleDecoderV2::unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 32; - const uint8_t *srcPtr = reinterpret_cast(bufferStart); + const uint8_t* srcPtr = reinterpret_cast(bufferStart); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; @@ -4230,11 +4376,11 @@ namespace orc { dstPtr += 16; } } - + if (numElements > 0) { bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); unrolledUnpack32(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); dstPtr += numElements; bufRestByteLen = bufferEnd - bufferStart; } @@ -4242,10 +4388,11 @@ namespace orc { if (bufMoveByteLen <= bufRestByteLen) { resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; - } - + } + if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen);; + resetBufferStart(bufRestByteLen, 
resetBuf, backupByteLen); + ; unrolledUnpack32(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; @@ -4256,7 +4403,7 @@ namespace orc { bufRestByteLen = bufferEnd - bufferStart; bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + srcPtr = reinterpret_cast(bufferStart); } } #endif @@ -4535,8 +4682,8 @@ namespace orc { } } - void RleDecoderV2::plainUnpackLongs(int64_t *data, uint64_t offset, uint64_t len, - uint64_t fbs, uint64_t& startBit) { + void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit) { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; diff --git a/c++/src/VectorDecoder.hh b/c++/src/VectorDecoder.hh index c2629a3419..76f7cc6395 100644 --- a/c++/src/VectorDecoder.hh +++ b/c++/src/VectorDecoder.hh @@ -24,483 +24,457 @@ #include namespace orc { -#define ORC_VECTOR_BITS_2_BYTE(x) (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */ -#define ORC_VECTOR_ONE_64U (1ULL) -#define ORC_VECTOR_MAX_16U 0xFFFF /**< Max value for uint16_t */ -#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */ -#define ORC_VECTOR_BYTE_WIDTH 8u /**< Byte width in bits */ -#define ORC_VECTOR_WORD_WIDTH 16u /**< Word width in bits */ -#define ORC_VECTOR_DWORD_WIDTH 32u /**< Dword width in bits */ -#define ORC_VECTOR_QWORD_WIDTH 64u /**< Qword width in bits */ -#define ORC_VECTOR_BIT_MASK(x) ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */ - -#define ORC_VECTOR_BITS_2_WORD(x) (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */ -#define ORC_VECTOR_BITS_2_DWORD(x) (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */ - -// ------------------------------------ 3u ----------------------------------------- -static uint8_t shuffleIdxTable3u_0[64] = { - 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, - 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 
4u, 6u, 5u, - 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, - 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u}; -static uint8_t shuffleIdxTable3u_1[64] = { - 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, - 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, - 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, - 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; -static uint16_t shiftTable3u_0[32] = { - 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; -static uint16_t shiftTable3u_1[32] = { - 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; -static uint16_t permutexIdxTable3u[32] = { - 0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, 0x0, 0x0, 0x0, 0x0, 0x0, - 6u, 7u, 8u, 0x0, 0x0, 0x0, 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; - -// ------------------------------------ 5u ----------------------------------------- -static uint8_t shuffleIdxTable5u_0[64] = { - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; -static uint8_t shuffleIdxTable5u_1[64] = { - 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, - 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, - 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, - 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u}; -static uint16_t shiftTable5u_0[32] = { - 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; -static uint16_t shiftTable5u_1[32] = { - 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 
6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; -static uint16_t permutexIdxTable5u[32] = { - 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, - 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; - -// ------------------------------------ 6u ----------------------------------------- -static uint8_t shuffleIdxTable6u_0[64] = { - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; -static uint8_t shuffleIdxTable6u_1[64] = { - 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, - 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, - 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, - 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; -static uint16_t shiftTable6u_0[32] = { - 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; -static uint16_t shiftTable6u_1[32] = { - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; -static uint32_t permutexIdxTable6u[16] = { - 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; - -// ------------------------------------ 7u ----------------------------------------- -static uint8_t shuffleIdxTable7u_0[64] = { - 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, - 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, - 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, - 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u}; -static uint8_t shuffleIdxTable7u_1[64] = { - 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 
13u, - 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, - 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, - 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; -static uint16_t shiftTable7u_0[32] = { - 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; -static uint16_t shiftTable7u_1[32] = { - 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; -static uint16_t permutexIdxTable7u[32] = { - 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, - 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; - -// ------------------------------------ 9u ----------------------------------------- -static uint16_t permutexIdxTable9u_0[32] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; -static uint16_t permutexIdxTable9u_1[32] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; -static uint32_t shiftTable9u_0[16] = { - 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; -static uint32_t shiftTable9u_1[16] = { - 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; - -static uint8_t shuffleIdxTable9u_0[64] = { - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; -static uint16_t shiftTable9u_2[32] = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -static uint64_t gatherIdxTable9u[8] = { - 0u, 8u, 9u, 17u, 
18u, 26u, 27u, 35u}; - -// ------------------------------------ 10u ----------------------------------------- -static uint8_t shuffleIdxTable10u_0[64] = { - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, - 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; -static uint16_t shiftTable10u[32] = { - 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; -static uint16_t permutexIdxTable10u[32] = { - 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, - 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; - -// ------------------------------------ 11u ----------------------------------------- -static uint16_t permutexIdxTable11u_0[32] = { - 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u}; -static uint16_t permutexIdxTable11u_1[32] = { - 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; -static uint32_t shiftTable11u_0[16] = { - 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; -static uint32_t shiftTable11u_1[16] = { - 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; - -static uint8_t shuffleIdxTable11u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; -static uint8_t shuffleIdxTable11u_1[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, 
- 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; -static uint32_t shiftTable11u_2[16] = { - 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; -static uint32_t shiftTable11u_3[16] = { - 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; -static uint64_t gatherIdxTable11u[8] = { - 0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; - -// ------------------------------------ 12u ----------------------------------------- -static uint8_t shuffleIdxTable12u_0[64] = { - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, - 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; -static uint16_t shiftTable12u[32] = { - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; -static uint32_t permutexIdxTable12u[16] = { - 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; - -// ------------------------------------ 13u ----------------------------------------- -static uint16_t permutexIdxTable13u_0[32] = { - 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, - 13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u}; -static uint16_t permutexIdxTable13u_1[32] = { - 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, - 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; -static uint32_t shiftTable13u_0[16] = { - 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; -static uint32_t shiftTable13u_1[16] = { - 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; - -static uint8_t shuffleIdxTable13u_0[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, - 3u, 2u, 
1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; -static uint8_t shuffleIdxTable13u_1[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; -static uint32_t shiftTable13u_2[16] = { - 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; -static uint32_t shiftTable13u_3[16] = { - 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; -static uint64_t gatherIdxTable13u[8] = { - 0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; - -// ------------------------------------ 14u ----------------------------------------- -static uint8_t shuffleIdxTable14u_0[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; -static uint8_t shuffleIdxTable14u_1[64] = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; -static uint32_t shiftTable14u_0[16] = { - 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; -static uint32_t shiftTable14u_1[16] = { - 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; -static uint16_t permutexIdxTable14u[32] = { - 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, - 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 
22u, 23u, 24u, 25u, 26u, 27u, 0x0}; - -// ------------------------------------ 15u ----------------------------------------- -static uint16_t permutexIdxTable15u_0[32] = { - 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, - 15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u}; -static uint16_t permutexIdxTable15u_1[32] = { - 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, - 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; -static uint32_t shiftTable15u_0[16] = { - 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; -static uint32_t shiftTable15u_1[16] = { - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; - -static uint8_t shuffleIdxTable15u_0[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u}; -static uint8_t shuffleIdxTable15u_1[64] = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; -static uint32_t shiftTable15u_2[16] = { - 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; -static uint32_t shiftTable15u_3[16] = { - 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; -static uint64_t gatherIdxTable15u[8] = { - 0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; - -// ------------------------------------ 17u ----------------------------------------- -static uint32_t permutexIdxTable17u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; -static uint32_t permutexIdxTable17u_1[16] = { - 
0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; -static uint64_t shiftTable17u_0[8] = { - 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; -static uint64_t shiftTable17u_1[8] = { - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - -static uint8_t shuffleIdxTable17u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; -static uint32_t shiftTable17u_2[16] = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; -static uint64_t gatherIdxTable17u[8] = { - 0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; - -// ------------------------------------ 18u ----------------------------------------- -static uint32_t permutexIdxTable18u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; -static uint32_t permutexIdxTable18u_1[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; -static uint64_t shiftTable18u_0[8] = { - 0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; -static uint64_t shiftTable18u_1[8] = { - 14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; - -static uint8_t shuffleIdxTable18u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; -static uint32_t shiftTable18u_2[16] = { - 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; -static uint64_t gatherIdxTable18u[8] = { - 0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; - -// ------------------------------------ 19u ----------------------------------------- -static uint32_t permutexIdxTable19u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; -static uint32_t permutexIdxTable19u_1[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 4u, 
5u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; -static uint64_t shiftTable19u_0[8] = { - 0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; -static uint64_t shiftTable19u_1[8] = { - 13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; - -static uint8_t shuffleIdxTable19u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; -static uint32_t shiftTable19u_2[16] = { - 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; -static uint64_t gatherIdxTable19u[8] = { - 0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; - -// ------------------------------------ 20u ----------------------------------------- -static uint8_t shuffleIdxTable20u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; -static uint32_t shiftTable20u[16] = { - 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; -static uint16_t permutexIdxTable20u[32] = { - 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, - 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; - -// ------------------------------------ 21u ----------------------------------------- -static uint32_t permutexIdxTable21u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; -static uint32_t permutexIdxTable21u_1[16] = { - 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; -static uint64_t shiftTable21u_0[8] = { - 0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; -static uint64_t shiftTable21u_1[8] = { - 11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; - -static uint8_t shuffleIdxTable21u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 
9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; -static uint32_t shiftTable21u_2[16] = { - 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; -static uint64_t gatherIdxTable21u[8] = { - 0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; - -// ------------------------------------ 22u ----------------------------------------- -static uint32_t permutexIdxTable22u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; -static uint32_t permutexIdxTable22u_1[16] = { - 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; -static uint64_t shiftTable22u_0[8] = { - 0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; -static uint64_t shiftTable22u_1[8] = { - 10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; - -static uint8_t shuffleIdxTable22u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; -static uint32_t shiftTable22u_2[16] = { - 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; -static uint64_t gatherIdxTable22u[8] = { - 0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; - -// ------------------------------------ 23u ----------------------------------------- -static uint32_t permutexIdxTable23u_0[16] = { - 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; -static uint32_t permutexIdxTable23u_1[16] = { - 0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; -static uint64_t shiftTable23u_0[8] = { - 0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; -static uint64_t shiftTable23u_1[8] = { - 9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; - -static uint8_t shuffleIdxTable23u_0[64] = { - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 
2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, - 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; -static uint32_t shiftTable23u_2[16] = { - 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; -static uint64_t gatherIdxTable23u[8] = { - 0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; - -// ------------------------------------ 24u ----------------------------------------- -static uint8_t shuffleIdxTable24u_0[64] = { - 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, - 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; -static uint32_t permutexIdxTable24u[16] = { - 0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; - -// ------------------------------------ 26u ----------------------------------------- -static uint32_t permutexIdxTable26u_0[16] = { - 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; -static uint32_t permutexIdxTable26u_1[16] = { - 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; -static uint64_t shiftTable26u_0[8] = { - 0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; -static uint64_t shiftTable26u_1[8] = { - 6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; - -static uint8_t shuffleIdxTable26u_0[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; -static uint32_t shiftTable26u_2[16] = { - 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; -static uint64_t gatherIdxTable26u[8] = { - 0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; - -// ------------------------------------ 28u 
----------------------------------------- -static uint8_t shuffleIdxTable28u_0[64] = { - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, - 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; -static uint32_t shiftTable28u[16] = { - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; -static uint16_t permutexIdxTable28u[32] = { - 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, - 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; - -// ------------------------------------ 30u ----------------------------------------- -static uint32_t permutexIdxTable30u_0[16] = { - 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; -static uint32_t permutexIdxTable30u_1[16] = { - 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; -static uint64_t shiftTable30u_0[8] = { - 0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; -static uint64_t shiftTable30u_1[8] = { - 2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; - -static uint8_t shuffleIdxTable30u_0[64] = { - 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, - 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u}; -static uint8_t shuffleIdxTable30u_1[64] = { - 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, - 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u}; -static uint64_t shiftTable30u_2[8] = { - 34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; -static uint64_t shiftTable30u_3[8] = { - 28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; -static uint64_t 
gatherIdxTable30u[8] = { - 0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; - -static uint64_t nibbleReverseTable[8] = { - 0x0E060A020C040800, - 0x0F070B030D050901, - 0x0E060A020C040800, - 0x0F070B030D050901, - 0x0E060A020C040800, - 0x0F070B030D050901, - 0x0E060A020C040800, - 0x0F070B030D050901 -}; - -static uint64_t reverseMaskTable1u[8] = { - 0x0001020304050607, - 0x08090A0B0C0D0E0F, - 0x1011121314151617, - 0x18191A1B1C1D1E1F, - 0x2021222324252627, - 0x28292A2B2C2D2E2F, - 0x3031323334353637, - 0x38393A3B3C3D3E3F -}; - -static uint64_t reverseMaskTable16u[8] = { - 0x0607040502030001, - 0x0E0F0C0D0A0B0809, - 0x1617141512131011, - 0x1E1F1C1D1A1B1819, - 0x2627242522232021, - 0x2E2F2C2D2A2B2829, - 0x3637343532333031, - 0x3E3F3C3D3A3B3839 -}; - -static uint64_t reverseMaskTable32u[8] = { - 0x0405060700010203, - 0x0C0D0E0F08090A0B, - 0x1415161710111213, - 0x1C1D1E1F18191A1B, - 0x2425262720212223, - 0x2C2D2E2F28292A2B, - 0x3435363730313233, - 0x3C3D3E3F38393A3B -}; - -uint32_t getAlign (uint32_t start_bit, uint32_t base, uint32_t bitsize) { +#define ORC_VECTOR_BITS_2_BYTE(x) \ + (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */ +#define ORC_VECTOR_ONE_64U (1ULL) +#define ORC_VECTOR_MAX_16U 0xFFFF /**< Max value for uint16_t */ +#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */ +#define ORC_VECTOR_BYTE_WIDTH 8u /**< Byte width in bits */ +#define ORC_VECTOR_WORD_WIDTH 16u /**< Word width in bits */ +#define ORC_VECTOR_DWORD_WIDTH 32u /**< Dword width in bits */ +#define ORC_VECTOR_QWORD_WIDTH 64u /**< Qword width in bits */ +#define ORC_VECTOR_BIT_MASK(x) \ + ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */ + +#define ORC_VECTOR_BITS_2_WORD(x) \ + (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */ +#define ORC_VECTOR_BITS_2_DWORD(x) \ + (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */ + + // ------------------------------------ 3u 
----------------------------------------- + static uint8_t shuffleIdxTable3u_0[64] = { + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, + 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, + 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u}; + static uint8_t shuffleIdxTable3u_1[64] = { + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, + 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, + 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; + static uint16_t shiftTable3u_0[32] = {13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, + 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, + 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; + static uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static uint16_t permutexIdxTable3u[32] = {0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, + 0x0, 0x0, 0x0, 0x0, 0x0, 6u, 7u, 8u, 0x0, 0x0, 0x0, + 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; + + // ------------------------------------ 5u ----------------------------------------- + static uint8_t shuffleIdxTable5u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, + 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; + static uint8_t shuffleIdxTable5u_1[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, + 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, + 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u}; + static uint16_t shiftTable5u_0[32] = {11u, 9u, 7u, 5u, 11u, 9u, 7u, 
5u, 11u, 9u, 7u, + 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, + 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; + static uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, + 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, + 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; + static uint16_t permutexIdxTable5u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, + 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 6u ----------------------------------------- + static uint8_t shuffleIdxTable6u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; + static uint8_t shuffleIdxTable6u_1[64] = { + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; + static uint16_t shiftTable6u_0[32] = {10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, + 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, + 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; + static uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 7u ----------------------------------------- + static uint8_t shuffleIdxTable7u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 
5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u}; + static uint8_t shuffleIdxTable7u_1[64] = { + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; + static uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, + 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, + 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; + static uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static uint16_t permutexIdxTable7u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, + 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 9u ----------------------------------------- + static uint16_t permutexIdxTable9u_0[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, + 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u, 11u, 12u, + 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; + static uint16_t permutexIdxTable9u_1[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, + 7u, 7u, 8u, 8u, 9u, 9u, 10u, 10u, 11u, 11u, 12u, + 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; + static uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + static uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, + 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; + + static uint8_t shuffleIdxTable9u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, + 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; 
+ static uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, + 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, + 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + static uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + + // ------------------------------------ 10u ----------------------------------------- + static uint8_t shuffleIdxTable10u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, + 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; + static uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static uint16_t permutexIdxTable10u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, + 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 11u ----------------------------------------- + static uint16_t permutexIdxTable11u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u, + 11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u}; + static uint16_t permutexIdxTable11u_1[32] = { + 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, + 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; + static uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, + 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; + static uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, + 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; + + static uint8_t shuffleIdxTable11u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 
1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static uint8_t shuffleIdxTable11u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; + static uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, + 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; + static uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, + 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; + static uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + + // ------------------------------------ 12u ----------------------------------------- + static uint8_t shuffleIdxTable12u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; + static uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 13u ----------------------------------------- + static uint16_t permutexIdxTable13u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, + 13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u}; + static uint16_t permutexIdxTable13u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, + 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; + static uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, + 0u, 10u, 4u, 14u, 8u, 2u, 12u, 
6u}; + static uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, + 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; + + static uint8_t shuffleIdxTable13u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static uint8_t shuffleIdxTable13u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; + static uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, + 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; + static uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, + 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; + static uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + + // ------------------------------------ 14u ----------------------------------------- + static uint8_t shuffleIdxTable14u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; + static uint8_t shuffleIdxTable14u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; + static uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, + 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; + static uint32_t 
shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static uint16_t permutexIdxTable14u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, + 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 15u ----------------------------------------- + static uint16_t permutexIdxTable15u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, + 15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u}; + static uint16_t permutexIdxTable15u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, + 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; + static uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, + 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; + static uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; + + static uint8_t shuffleIdxTable15u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u}; + static uint8_t shuffleIdxTable15u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; + static uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, + 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; + static uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; + static uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 
45u, 53u}; + + // ------------------------------------ 17u ----------------------------------------- + static uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + static uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + + static uint8_t shuffleIdxTable17u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; + static uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; + static uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; + + // ------------------------------------ 18u ----------------------------------------- + static uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + static uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; + static uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; + + static uint8_t shuffleIdxTable18u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; + static uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; + static uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + + // 
------------------------------------ 19u ----------------------------------------- + static uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; + static uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + static uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; + static uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; + + static uint8_t shuffleIdxTable19u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; + static uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, + 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; + static uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; + + // ------------------------------------ 20u ----------------------------------------- + static uint8_t shuffleIdxTable20u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; + static uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static uint16_t permutexIdxTable20u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, + 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 21u ----------------------------------------- + static uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; + static uint32_t permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + 5u, 6u, 7u, 8u, 8u, 
9u, 9u, 10u}; + static uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; + static uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; + + static uint8_t shuffleIdxTable21u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, + 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, + 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; + static uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; + + // ------------------------------------ 22u ----------------------------------------- + static uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; + static uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, + 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; + static uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; + static uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; + + static uint8_t shuffleIdxTable22u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, + 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; + static uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + + // ------------------------------------ 23u ----------------------------------------- + static uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; + static uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, + 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; + 
static uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; + static uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; + + static uint8_t shuffleIdxTable23u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, + 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; + static uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; + + // ------------------------------------ 24u ----------------------------------------- + static uint8_t shuffleIdxTable24u_0[64] = { + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; + static uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 26u ----------------------------------------- + static uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; + static uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, + 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; + static uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; + static uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; + + static uint8_t shuffleIdxTable26u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 
8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + + // ------------------------------------ 28u ----------------------------------------- + static uint8_t shuffleIdxTable28u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; + static uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static uint16_t permutexIdxTable28u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, + 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 30u ----------------------------------------- + static uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, + 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; + static uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, + 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; + static uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; + static uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; + + static uint8_t shuffleIdxTable30u_0[64] = { + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u}; + static uint8_t shuffleIdxTable30u_1[64] = { + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 
11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u}; + static uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; + static uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; + static uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + + static uint64_t nibbleReverseTable[8] = { + 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901, + 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901}; + + static uint64_t reverseMaskTable1u[8] = { + 0x0001020304050607, 0x08090A0B0C0D0E0F, 0x1011121314151617, 0x18191A1B1C1D1E1F, + 0x2021222324252627, 0x28292A2B2C2D2E2F, 0x3031323334353637, 0x38393A3B3C3D3E3F}; + + static uint64_t reverseMaskTable16u[8] = { + 0x0607040502030001, 0x0E0F0C0D0A0B0809, 0x1617141512131011, 0x1E1F1C1D1A1B1819, + 0x2627242522232021, 0x2E2F2C2D2A2B2829, 0x3637343532333031, 0x3E3F3C3D3A3B3839}; + + static uint64_t reverseMaskTable32u[8] = { + 0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B, + 0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B}; + + uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) { uint32_t remnant = bitsize - start_bit; uint32_t ret_value = 0xFFFFFFFF; for (uint32_t i = 0u; i < bitsize; ++i) { - uint32_t test_value = (i * base) % bitsize; - if (test_value == remnant) { - ret_value = i; - break; - } + uint32_t test_value = (i * base) % bitsize; + if (test_value == remnant) { + ret_value = i; + break; + } } return ret_value; -} + } -inline uint64_t moveLen(uint64_t x, uint64_t y) { + inline uint64_t moveLen(uint64_t x, uint64_t y) { uint64_t result = 0; if (x % y == 0) { - result = x / y; + result = x / y; } else { - result = x / y + 1; + result = x / y + 1; } return result; -} -} // namespace orc + } +} // namespace orc #endif #endif From 6f8cb56bbd0b86a519b735da973bc0c8c0126d66 Mon Sep 17 00:00:00 2001 From: 
wpleonardo Date: Tue, 31 Jan 2023 14:33:56 +0530 Subject: [PATCH 18/80] 1.Add an Env parameter "ENABLE_RUNTIME_AVX512" to open or close AVX512 at the runtime 2.Delete Macro ENABLE_AVX512, use Macro ORC_HAVE_RUNTIME_AVX512 to enable AVX512 feature in the build process 3.Format some parameter names. --- CMakeLists.txt | 21 +++++++++------------ c++/src/DetectPlatform.hh | 20 ++++++++++---------- c++/src/RLEv2.hh | 4 ++-- c++/src/RleDecoderV2.cc | 8 +++++--- c++/src/VectorDecoder.hh | 10 +++++++++- c++/test/TestRleVectorDecoder.cc | 2 +- 6 files changed, 36 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 702c5dcb33..bbaf7482e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ option(BUILD_CPP_ENABLE_METRICS option(BUILD_ENABLE_AVX512 "Enable AVX512 vector decode of bit-packing" - OFF) + ON) # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) @@ -240,20 +240,25 @@ if(ORC_CPU_FLAG STREQUAL "x86") if(CXX_SUPPORTS_SSE4_2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(SSE4_2|AVX2|AVX512|MAX)$") set(ORC_HAVE_RUNTIME_SSE4_2 ON) + set(ORC_SIMD_LEVEL "SSE4_2") add_definitions(-DORC_HAVE_RUNTIME_SSE4_2) endif() if(CXX_SUPPORTS_AVX2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$") set(ORC_HAVE_RUNTIME_AVX2 ON) + set(ORC_SIMD_LEVEL "AVX2") add_definitions(-DORC_HAVE_RUNTIME_AVX2 -DORC_HAVE_RUNTIME_BMI2) endif() - if(CXX_SUPPORTS_AVX512 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") + if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") + message(STATUS "Enable the AVX512 vector decode of bit-packing") set(ORC_HAVE_RUNTIME_AVX512 ON) + set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512 -DORC_HAVE_RUNTIME_BMI2) + else () + message(STATUS "Disable the AVX512 vector decode of bit-packing") endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") - set(ORC_SIMD_LEVEL "AVX512") + set(ORC_SIMD_LEVEL "NONE") endif() - elseif(ORC_CPU_FLAG STREQUAL "ppc") # power
compiler flags, gcc/clang only set(ORC_ALTIVEC_FLAG "-maltivec") @@ -308,14 +313,6 @@ else () add_compile_definitions(ENABLE_METRICS=0) endif () -if (BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND ORC_SIMD_LEVEL STREQUAL "AVX512") - message(STATUS "Enable the AVX512 vector decode of bit-packing") - add_compile_definitions(ENABLE_AVX512=1) -else () - message(STATUS "Disable the AVX512 vector decode of bit-packing") - add_compile_definitions(ENABLE_AVX512=0) -endif () - enable_testing() INCLUDE(CheckSourceCompiles) diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh index 03fd158402..a1df00b1e8 100644 --- a/c++/src/DetectPlatform.hh +++ b/c++/src/DetectPlatform.hh @@ -59,29 +59,29 @@ namespace orc { enum class Arch { PX_ARCH = 0, AVX2_ARCH = 1, AVX512_ARCH = 2 }; Arch detectPlatform() { - Arch detected_platform = Arch::PX_ARCH; + Arch detectedPlatform = Arch::PX_ARCH; int cpuInfo[4]; cpuid(cpuInfo, 1); - bool avx512_support_cpu = cpuInfo[1] & CPUID_AVX512_MASK; - bool os_uses_XSAVE_XSTORE = cpuInfo[2] & EXC_OSXSAVE; + bool avx512SupportCpu = cpuInfo[1] & CPUID_AVX512_MASK; + bool osUsesXSaveXStore = cpuInfo[2] & EXC_OSXSAVE; - if (avx512_support_cpu && os_uses_XSAVE_XSTORE) { + if (avx512SupportCpu && osUsesXSaveXStore) { // Check if XMM state and YMM state are saved #ifdef _WIN32 - unsigned long long xcr_feature_mask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ + unsigned long long xcrFeatureMask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ #else - unsigned long long xcr_feature_mask = xgetbv(0); + unsigned long long xcrFeatureMask = xgetbv(0); #endif - if ((xcr_feature_mask & 0x6) == 0x6) { // AVX2 is supported now - if ((xcr_feature_mask & 0xe0) == 0xe0) { // AVX512 is supported now - detected_platform = Arch::AVX512_ARCH; + if ((xcrFeatureMask & 0x6) == 0x6) { // AVX2 is supported now + if ((xcrFeatureMask & 0xe0) == 0xe0) { // AVX512 is supported now + detectedPlatform = Arch::AVX512_ARCH; } } } - return detected_platform; + 
return detectedPlatform; } } // namespace orc diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index e87398d946..20ee402346 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -202,7 +202,7 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); -#if ENABLE_AVX512 +#if defined(ORC_HAVE_RUNTIME_AVX512) void unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len); void unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len); void unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len); @@ -265,7 +265,7 @@ namespace orc { uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run -#if ENABLE_AVX512 +#if defined(ORC_HAVE_RUNTIME_AVX512) uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index c05a7cc1bf..30b36e8c48 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -92,8 +92,10 @@ namespace orc { void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { uint64_t startBit = 0; -#if ENABLE_AVX512 - if (detectPlatform() == Arch::AVX512_ARCH) { +#if defined(ORC_HAVE_RUNTIME_AVX512) + const auto runtimeEnable = getenv("ENABLE_RUNTIME_AVX512"); + std::string avxRuntimeEnable = runtimeEnable == nullptr ? 
"OFF" : std::string(runtimeEnable); + if (detectPlatform() == Arch::AVX512_ARCH && strcasecmp(avxRuntimeEnable.c_str(), "on") == 0) { switch (fbs) { case 1: unrolledUnpackVector1(data, offset, len); @@ -268,7 +270,7 @@ namespace orc { #endif } -#if ENABLE_AVX512 +#if defined(ORC_HAVE_RUNTIME_AVX512) void RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 1; const uint8_t* srcPtr = reinterpret_cast(bufferStart); diff --git a/c++/src/VectorDecoder.hh b/c++/src/VectorDecoder.hh index 76f7cc6395..67cbb374fc 100644 --- a/c++/src/VectorDecoder.hh +++ b/c++/src/VectorDecoder.hh @@ -19,7 +19,15 @@ #ifndef VECTOR_DECODER_HH #define VECTOR_DECODER_HH -#if ENABLE_AVX512 +#include +// Mingw-w64 defines strcasecmp in string.h +#if defined(_WIN32) && !defined(strcasecmp) +#define strcasecmp stricmp +#else +#include +#endif + +#if defined(ORC_HAVE_RUNTIME_AVX512) #include #include diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index c3a3fc635e..6eb8a82121 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -141,7 +141,7 @@ namespace orc { delete[] notNull; } -#if ENABLE_AVX512 +#if defined(ORC_HAVE_RUNTIME_AVX512) TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { uint8_t bitWidth = 1; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { From c1c2448c813ae15e768f41bf52121709df207f06 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 1 Feb 2023 16:40:51 +0800 Subject: [PATCH 19/80] Update CMakeLists.txt Co-authored-by: Gang Wu --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbaf7482e7..29c93ab7d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ option(BUILD_CPP_ENABLE_METRICS "Enable the metrics collection at compile phase" OFF) -option(BUILD_ENABLE_AVX512 +option(BUILD_CPP_AVX512 "Enable AVX512 vector decode of bit-packing" ON) From 
edf164fec5731d19fa2251f26f70e7f54286fea1 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 1 Feb 2023 16:41:23 +0800 Subject: [PATCH 20/80] Update CMakeLists.txt Co-authored-by: Gang Wu --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29c93ab7d3..9f9b39fac0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ option(BUILD_CPP_ENABLE_METRICS OFF) option(BUILD_CPP_AVX512 - "Enable AVX512 vector decode of bit-packing" + "Enable build with AVX512 at compile time" ON) # Make sure that a build type is selected From 743ac84211e68026f78271d24a2cff6f6096e1e9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Feb 2023 17:15:31 -0500 Subject: [PATCH 21/80] 1.Add the dynamic dispatch function to distribute avx512 and default unpacking. 2.delete the file c++/src/DetectPlatform.hh 3.Add the CpuInfoUtil.cc new file to check if current cpu supports avx512 --- CMakeLists.txt | 80 +- ...{VectorDecoder.hh => BitUnpackerAvx512.hh} | 2 +- c++/src/Bpacking.cc | 213 + c++/src/Bpacking.hh | 40 + c++/src/BpackingAvx512.cc | 4321 +++++++++++++++ c++/src/BpackingAvx512.hh | 94 + c++/src/BpackingDefault.cc | 337 ++ c++/src/BpackingDefault.hh | 69 + c++/src/CMakeLists.txt | 370 +- c++/src/CpuInfoUtil.cc | 600 +++ c++/src/CpuInfoUtil.hh | 110 + c++/src/DetectPlatform.hh | 88 - c++/src/Dispatch.hh | 109 + c++/src/RLEv2.hh | 66 +- c++/src/RleDecoderV2.cc | 4683 +---------------- c++/test/TestRleVectorDecoder.cc | 72 +- 16 files changed, 6228 insertions(+), 5026 deletions(-) rename c++/src/{VectorDecoder.hh => BitUnpackerAvx512.hh} (100%) create mode 100644 c++/src/Bpacking.cc create mode 100644 c++/src/Bpacking.hh create mode 100644 c++/src/BpackingAvx512.cc create mode 100644 c++/src/BpackingAvx512.hh create mode 100644 c++/src/BpackingDefault.cc create mode 100644 c++/src/BpackingDefault.hh create mode 100644 c++/src/CpuInfoUtil.cc create mode 100644 c++/src/CpuInfoUtil.hh delete mode 100644 
c++/src/DetectPlatform.hh create mode 100644 c++/src/Dispatch.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f9b39fac0..6ac39ddb83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +#Licensed under the Apache License, Version 2.0(the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +#http: // www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
cmake_minimum_required (VERSION 3.12.0) if (POLICY CMP0048) @@ -20,12 +20,13 @@ endif () project(ORC C CXX) -# Version number of package +#Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "9") SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules") +set(CMAKE_MODULE_PATH ${ + CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules") option (BUILD_JAVA "Include ORC Java library in the build process" @@ -71,7 +72,7 @@ option(BUILD_CPP_AVX512 "Enable build with AVX512 at compile time" ON) -# Make sure that a build type is selected +#Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") set (CMAKE_BUILD_TYPE "RELWITHDEBINFO") @@ -79,7 +80,7 @@ else () message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") endif () -# Set the package format +#Set the package format SET(CPACK_GENERATOR "TGZ") SET(CPACK_PACKAGE_VENDOR "Apache ORC") SET(CPACK_PACKAGE_CONTACT "Apache ORC ") @@ -103,15 +104,15 @@ if(NOT DEFINED ORC_RUNTIME_SIMD_LEVEL) endif() # -# Compiler specific flags +#Compiler specific flags # -# This ensures that things like c++17 get passed correctly +#This ensures that things like c++ 17 get passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 17) message(FATAL_ERROR "Cannot set a CMAKE_CXX_STANDARD smaller than 17") endif() -# We require a C++17 compliant compiler +#We require a C++ 17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) if (NOT MSVC) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer") @@ -194,9 +195,9 @@ if(NOT DEFINED ORC_CPU_FLAG) endif() endif() -# Check architecture specific compiler flags +#Check architecture specific compiler flags if(ORC_CPU_FLAG STREQUAL 
"x86") - # x86/amd64 compiler flags, msvc/gcc/clang +#x86 / amd64 compiler flags, msvc / gcc / clang if(MSVC) set(ORC_SSE4_2_FLAG "") set(ORC_AVX2_FLAG "/arch:AVX2") @@ -205,38 +206,37 @@ if(ORC_CPU_FLAG STREQUAL "x86") else() set(ORC_SSE4_2_FLAG "-msse4.2") set(ORC_AVX2_FLAG "-march=haswell") - # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ +#skylake - avx512 consists of AVX512F, AVX512BW, AVX512VL, AVX512CD, AVX512DQ set(ORC_AVX512_FLAG "-march=native -mbmi2") - # Append the avx2/avx512 subset option also, fix issue ORC-9877 for homebrew-cpp set(ORC_AVX2_FLAG "${ORC_AVX2_FLAG} -mavx2") set(ORC_AVX512_FLAG "${ORC_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") endif() check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) if(MINGW) - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 +#https: // gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 message(STATUS "Disable AVX512 support on MINGW for now") else() - # Check for AVX512 support in the compiler. +#Check for AVX512 support in the compiler. 
set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") check_cxx_source_compiles(" - #ifdef _MSC_VER - #include - #else - #include - #endif +#ifdef _MSC_VER +#include +#else +#include +#endif int main() { - __m512i mask = _mm512_set1_epi32(0x1); - char out[32]; - _mm512_storeu_si512(out, mask); - return 0; + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; }" CXX_SUPPORTS_AVX512) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() - # Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL +#Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL if(CXX_SUPPORTS_SSE4_2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(SSE4_2|AVX2|AVX512|MAX)$") set(ORC_HAVE_RUNTIME_SSE4_2 ON) @@ -260,14 +260,14 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "NONE") endif() elseif(ORC_CPU_FLAG STREQUAL "ppc") - # power compiler flags, gcc/clang only +#power compiler flags, gcc / clang only set(ORC_ALTIVEC_FLAG "-maltivec") check_cxx_compiler_flag(${ORC_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC) if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") set(ORC_SIMD_LEVEL "NONE") endif() elseif(ORC_CPU_FLAG STREQUAL "aarch64") - # Arm64 compiler flags, gcc/clang only +#Arm64 compiler flags, gcc / clang only set(ORC_ARMV8_MARCH "armv8-a") check_cxx_compiler_flag("-march=${ORC_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") @@ -275,10 +275,10 @@ elseif(ORC_CPU_FLAG STREQUAL "aarch64") endif() endif() -# Only enable additional instruction sets if they are supported +#Only enable additional instruction sets if they are supported if(ORC_CPU_FLAG STREQUAL "x86") if(MINGW) - # Enable _xgetbv() intrinsic to query OS support for ZMM register saves +#Enable _xgetbv() intrinsic to query OS support for ZMM register saves set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") endif() if(ORC_SIMD_LEVEL STREQUAL "AVX512") @@ -335,15 +335,17 @@ if 
(BUILD_TOOLS) endif () if (BUILD_CPP_TESTS) - # Add another target called test-out that prints the results on failure +#Add another target called test - out that prints the results on failure if (CMAKE_CONFIGURATION_TYPES) add_custom_target (test-out - COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process + COMMAND ${ + CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure --build-config "$" ) else () add_custom_target (test-out - COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process + COMMAND ${ + CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure ) endif () diff --git a/c++/src/VectorDecoder.hh b/c++/src/BitUnpackerAvx512.hh similarity index 100% rename from c++/src/VectorDecoder.hh rename to c++/src/BitUnpackerAvx512.hh index 67cbb374fc..688acc728e 100644 --- a/c++/src/VectorDecoder.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -19,9 +19,9 @@ #ifndef VECTOR_DECODER_HH #define VECTOR_DECODER_HH -#include // Mingw-w64 defines strcasecmp in string.h #if defined(_WIN32) && !defined(strcasecmp) +#include #define strcasecmp stricmp #else #include diff --git a/c++/src/Bpacking.cc b/c++/src/Bpacking.cc new file mode 100644 index 0000000000..39993d2d52 --- /dev/null +++ b/c++/src/Bpacking.cc @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Bpacking.hh" +#include "CpuInfoUtil.hh" + +namespace orc { + int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs) { + UnpackDefault unpackDefault(decoder); + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. 
+ unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + return 0; + } + +#if defined(ORC_HAVE_RUNTIME_AVX512) + // template + int readLongsAvx512(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs) { + UnpackAvx512 unpackAvx512(decoder); + UnpackDefault unpackDefault(decoder); + uint64_t startBit = 0; + static const auto cpu_info = orc::CpuInfo::GetInstance(); + if (cpu_info->IsSupported(CpuInfo::AVX512)) { + switch (fbs) { + case 1: + unpackAvx512.unrolledUnpackVector1(data, offset, len); + break; + case 2: + unpackAvx512.unrolledUnpackVector2(data, offset, len); + break; + case 3: + unpackAvx512.unrolledUnpackVector3(data, offset, len); + break; + case 4: + unpackAvx512.unrolledUnpackVector4(data, offset, len); + break; + case 5: + unpackAvx512.unrolledUnpackVector5(data, offset, len); + break; + case 6: + unpackAvx512.unrolledUnpackVector6(data, offset, len); + break; + case 7: + unpackAvx512.unrolledUnpackVector7(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 9: + unpackAvx512.unrolledUnpackVector9(data, offset, len); + break; + case 10: + unpackAvx512.unrolledUnpackVector10(data, offset, len); + break; + case 11: + unpackAvx512.unrolledUnpackVector11(data, offset, len); + break; + case 12: + unpackAvx512.unrolledUnpackVector12(data, offset, len); + break; + case 13: + unpackAvx512.unrolledUnpackVector13(data, offset, len); + break; + case 14: + unpackAvx512.unrolledUnpackVector14(data, offset, len); + break; + case 15: + unpackAvx512.unrolledUnpackVector15(data, offset, len); + break; + case 16: + unpackAvx512.unrolledUnpackVector16(data, offset, len); + break; + case 17: + unpackAvx512.unrolledUnpackVector17(data, offset, len); + break; + case 18: + unpackAvx512.unrolledUnpackVector18(data, offset, len); + break; + case 19: + unpackAvx512.unrolledUnpackVector19(data, offset, len); + break; + case 20: + unpackAvx512.unrolledUnpackVector20(data, offset, 
len); + break; + case 21: + unpackAvx512.unrolledUnpackVector21(data, offset, len); + break; + case 22: + unpackAvx512.unrolledUnpackVector22(data, offset, len); + break; + case 23: + unpackAvx512.unrolledUnpackVector23(data, offset, len); + break; + case 24: + unpackAvx512.unrolledUnpackVector24(data, offset, len); + break; + case 26: + unpackAvx512.unrolledUnpackVector26(data, offset, len); + break; + case 28: + unpackAvx512.unrolledUnpackVector28(data, offset, len); + break; + case 30: + unpackAvx512.unrolledUnpackVector30(data, offset, len); + break; + case 32: + unpackAvx512.unrolledUnpackVector32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackAvx512.plainUnpackLongs(data, offset, len, fbs, startBit); + break; + } + } else { + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. 
+ unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + } + + return 0; + } +#endif + +} // namespace orc diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh new file mode 100644 index 0000000000..0366d747b5 --- /dev/null +++ b/c++/src/Bpacking.hh @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BPACKING_HH +#define ORC_BPACKING_HH + +#include + +#include "BpackingDefault.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#include "BpackingAvx512.hh" +#endif + +namespace orc { + int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + +#if defined(ORC_HAVE_RUNTIME_AVX512) + int readLongsAvx512(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); +#endif + +} // namespace orc + +#endif diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc new file mode 100644 index 0000000000..fbbda92c55 --- /dev/null +++ b/c++/src/BpackingAvx512.cc @@ -0,0 +1,4321 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BpackingAvx512.hh" +#include "BitUnpackerAvx512.hh" +#include "Utils.hh" + +namespace orc { + +#if defined(ORC_HAVE_RUNTIME_AVX512) + UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) + : decoder(dec), + unpackDefault(UnpackDefault(dec)), + bitsLeft(decoder->bitsLeft), + curByte(decoder->curByte) { + // PASS + } + + UnpackAvx512::~UnpackAvx512() { + // PASS + } + + void UnpackAvx512::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 1; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); + while (numElements >= 64) { + uint64_t src_64 = *(uint64_t*)srcPtr; + // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF + __m512i srcmm = _mm512_movm_epi8(src_64); + // make 0x00 --> 0x00, 0xFF --> 0x01 + srcmm = _mm512_abs_epi8(srcmm); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); + _mm512_storeu_si512(vectorBuf8, srcmm); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 2; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t 
bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) + __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 + while (numElements >= 64) { + __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + __m512i srcmm0, srcmm1, srcmm2, tmpmm; + + srcmm2 = _mm512_srli_epi16(srcmm3, 2); + srcmm1 = _mm512_srli_epi16(srcmm3, 4); + srcmm0 = 
_mm512_srli_epi16(srcmm3, 6); + + // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. + // move them into their places + // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn + + // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 + srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 + srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op + + tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl + srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop + + srcmm0 = _mm512_and_si512(srcmm0, parse_mask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 3; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint32_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / 
ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - 
decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 4; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if 
(bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) + __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 + while (numElements >= 64) { + __m512i srcmm0, srcmm1, tmpmm; + + srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm0 = _mm512_srli_epi16(srcmm1, 4); + + // move elements into their places + // srcmm0: a c e g 0 0 0 0 + // srcmm1: b d f h 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh + + // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
+ srcmm0 = _mm512_and_si512(srcmm0, parseMask); + + _mm512_storeu_si512(vectorBuf8, srcmm0); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 5; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t 
backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); + + while (numElements >= 
64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, 
&decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 6; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = 
decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, 
numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 7; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen 
* ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 8); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 64) { + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable7u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + 
+ _mm512_storeu_si512(vectorBuf8, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 8 * bitWidth; + numElements -= 64; + std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); + dstPtr += 64; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 9; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if 
(startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); + + __m512i 
shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = 
_mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 7); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + 
srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 10; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_load_si512(shiftTable10u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, 
&decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 11; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = 
decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + 
bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 5); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + 
std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 12; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = 
false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_load_si512(shiftTable12u); + + while (numElements >= 32) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * 
bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 13; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + 
bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = 
_mm512_load_si512(shiftTable13u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); + + while (numElements >= 64) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = 
_mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 3); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 
bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 14; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * 
bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); + + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= 
moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 15; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - 
startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 16); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 32) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); + + while (numElements >= 64) { + __m512i srcmm, 
zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + if (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, 
zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 1); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(vectorBuf16, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + 
bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 16; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = len; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + int64_t* dstPtr = data + offset; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 32) { + __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + while (numElements >= 32) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); + _mm512_storeu_si512(vectorBuf16, srcmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= 32; + std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + dstPtr += 32; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 
bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + ; + unpackDefault.unrolledUnpack16(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 17; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + 
uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if 
(numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 15); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + 
bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 18; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, 
shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 14); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = 
_mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 19; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - 
decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] 
= _mm512_load_si512(permutexIdxTable19u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); + shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + 
// gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 13); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + 
resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 20 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // Each pass of the outer loop decodes as many values as the buffered bytes allow:
+  //   1. if startBit is non-zero, up to getAlign() values go through plainUnpackLongs();
+  //   2. full groups of 16 values (2 * bitWidth bytes) are decoded with AVX-512;
+  //   3. the remaining values go through plainUnpackLongs().
+  // If the buffer held everything the function returns; otherwise resetBufferStart() is
+  // called with resetBuf = true and the loop continues with the values still owed.
+  // NOTE(review): startBit is only ever passed to plainUnpackLongs() here, so it changes
+  // only if that helper takes it by reference -- confirm.
+  void UnpackAvx512::unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 20;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        if (startBit != 0) {
+          numElements =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) /
+              bitWidth;
+          len -= numElements;
+          tailBitLen =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) % bitWidth;
+          resetBuf = true;
+        } else {
+          numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+          len -= numElements;
+          tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+          resetBuf = true;
+        }
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (startBit > 0u) {
+        uint32_t align = getAlign(startBit, bitWidth, 32u);
+        if (align > numElements) {
+          align = numElements;
+        }
+        if (align != 0) {
+          bufMoveByteLen -=
+              moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+          plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+          srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          dstPtr += align;
+          numElements -= align;
+        }
+      }
+
+      if (numElements >= 16u) {
+        __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
+        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
+
+        __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0);
+        __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u);
+        __m512i shiftMask = _mm512_load_si512(shiftTable20u);
+
+        // 16 values per pass; each pass consumes 2 * bitWidth bytes of input
+        while (numElements >= 16u) {
+          __m512i srcmm, zmm;
+
+          srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
+
+          zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm);
+          zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr);
+
+          // shifting elements so they start from the start of the word
+          zmm = _mm512_srlv_epi32(zmm, shiftMask);
+          zmm = _mm512_and_si512(zmm, parseMask0);
+
+          _mm512_storeu_si512(vectorBuf32, zmm);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        if (startBit != 0) {
+          bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit,
+                                    ORC_VECTOR_BYTE_WIDTH);
+        } else {
+          bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        }
+        plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 21 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // Each pass of the outer loop decodes as many values as the buffered bytes allow:
+  //   1. if startBit is non-zero, up to getAlign() values go through plainUnpackLongs();
+  //   2. full groups of 16 values (2 * bitWidth bytes) are decoded with AVX-512 -- a
+  //      64-bit gather while at least 32 values remain, then one masked load;
+  //   3. the remaining values go through plainUnpackLongs().
+  // If the buffer held everything the function returns; otherwise resetBufferStart() is
+  // called with resetBuf = true and the loop continues with the values still owed.
+  // NOTE(review): startBit is only ever passed to plainUnpackLongs() here, so it changes
+  // only if that helper takes it by reference -- confirm.
+  void UnpackAvx512::unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 21;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        if (startBit != 0) {
+          numElements =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) /
+              bitWidth;
+          len -= numElements;
+          tailBitLen =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) % bitWidth;
+          resetBuf = true;
+        } else {
+          numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+          len -= numElements;
+          tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+          resetBuf = true;
+        }
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (startBit > 0u) {
+        uint32_t align = getAlign(startBit, bitWidth, 32);
+        if (align > numElements) {
+          align = numElements;
+        }
+        if (align != 0) {
+          bufMoveByteLen -=
+              moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+          plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+          srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          dstPtr += align;
+          numElements -= align;
+        }
+      }
+
+      if (numElements >= 16) {
+        __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
+        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
+        __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable);
+        __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u);
+        __m512i maskmm = _mm512_set1_epi8(0x0F);
+
+        __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0);
+
+        __m512i permutexIdxPtr[2];
+        permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0);
+        permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1);
+
+        __m512i shiftMaskPtr[3];
+        shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0);
+        shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1);
+        shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2);
+
+        __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u);
+
+        // Gather path: decodes 16 values per pass but only runs while at least 32 remain.
+        // NOTE(review): presumably so the 8-byte gathers never read past the data of the
+        // following group -- confirm against gatherIdxTable21u.
+        while (numElements >= 32) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
+
+          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+
+        // Final full group of 16 values, read with a masked load instead of a gather.
+        if (numElements >= 16) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
+
+          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
+          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
+
+          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+          zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
+          zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
+          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
+
+          // gathering even and odd elements together
+          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          // shift by 32 - bitWidth before the second nibble-table pass
+          zmm[0] = _mm512_slli_epi32(zmm[0], 11);
+          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
+          highNibblemm = _mm512_srli_epi16(zmm[0], 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
+          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        if (startBit != 0) {
+          bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit,
+                                    ORC_VECTOR_BYTE_WIDTH);
+        } else {
+          bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        }
+        plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 22 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // Each pass of the outer loop decodes as many values as the buffered bytes allow:
+  //   1. if startBit is non-zero, up to getAlign() values go through plainUnpackLongs();
+  //   2. full groups of 16 values (2 * bitWidth bytes) are decoded with AVX-512 -- a
+  //      64-bit gather while at least 32 values remain, then one masked load;
+  //   3. the remaining values go through plainUnpackLongs().
+  // If the buffer held everything the function returns; otherwise resetBufferStart() is
+  // called with resetBuf = true and the loop continues with the values still owed.
+  // NOTE(review): startBit is only ever passed to plainUnpackLongs() here, so it changes
+  // only if that helper takes it by reference -- confirm.
+  void UnpackAvx512::unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 22;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        if (startBit != 0) {
+          numElements =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) /
+              bitWidth;
+          len -= numElements;
+          tailBitLen =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) % bitWidth;
+          resetBuf = true;
+        } else {
+          numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+          len -= numElements;
+          tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+          resetBuf = true;
+        }
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (startBit > 0) {
+        uint32_t align = getAlign(startBit, bitWidth, 32);
+        if (align > numElements) {
+          align = numElements;
+        }
+        if (align != 0) {
+          bufMoveByteLen -=
+              moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+          plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+          srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          dstPtr += align;
+          numElements -= align;
+        }
+      }
+
+      if (numElements >= 16) {
+        __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
+        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
+        __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable);
+        __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u);
+        __m512i maskmm = _mm512_set1_epi8(0x0F);
+
+        __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0);
+
+        __m512i permutexIdxPtr[2];
+        permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0);
+        permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable22u_1);
+
+        __m512i shiftMaskPtr[3];
+        shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0);
+        shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1);
+        shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2);
+
+        __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u);
+
+        // Gather path: decodes 16 values per pass but only runs while at least 32 remain.
+        // NOTE(review): presumably so the 8-byte gathers never read past the data of the
+        // following group -- confirm against gatherIdxTable22u.
+        while (numElements >= 32) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
+
+          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+
+        // Final full group of 16 values, read with a masked load instead of a gather.
+        if (numElements >= 16) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
+
+          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
+          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
+
+          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+          zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
+          zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
+          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
+
+          // gathering even and odd elements together
+          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          // shift by 32 - bitWidth before the second nibble-table pass
+          zmm[0] = _mm512_slli_epi32(zmm[0], 10);
+          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
+          highNibblemm = _mm512_srli_epi16(zmm[0], 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
+          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        if (startBit != 0) {
+          bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit,
+                                    ORC_VECTOR_BYTE_WIDTH);
+        } else {
+          bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        }
+        plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 23 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // Each pass of the outer loop decodes as many values as the buffered bytes allow:
+  //   1. if startBit is non-zero, up to getAlign() values go through plainUnpackLongs();
+  //   2. full groups of 16 values (2 * bitWidth bytes) are decoded with AVX-512 -- a
+  //      64-bit gather while at least 32 values remain, then one masked load;
+  //   3. the remaining values go through plainUnpackLongs().
+  // If the buffer held everything the function returns; otherwise resetBufferStart() is
+  // called with resetBuf = true and the loop continues with the values still owed.
+  // NOTE(review): startBit is only ever passed to plainUnpackLongs() here, so it changes
+  // only if that helper takes it by reference -- confirm.
+  void UnpackAvx512::unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 23;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        if (startBit != 0) {
+          numElements =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) /
+              bitWidth;
+          len -= numElements;
+          tailBitLen =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) % bitWidth;
+          resetBuf = true;
+        } else {
+          numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+          len -= numElements;
+          tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+          resetBuf = true;
+        }
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (startBit > 0) {
+        uint32_t align = getAlign(startBit, bitWidth, 32);
+        if (align > numElements) {
+          align = numElements;
+        }
+        if (align != 0) {
+          bufMoveByteLen -=
+              moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+          plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+          srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          dstPtr += align;
+          numElements -= align;
+        }
+      }
+
+      if (numElements >= 16) {
+        __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth);
+        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
+        __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable);
+        __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u);
+        __m512i maskmm = _mm512_set1_epi8(0x0F);
+
+        __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0);
+
+        __m512i permutexIdxPtr[2];
+        permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0);
+        permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1);
+
+        __m512i shiftMaskPtr[3];
+        shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0);
+        shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1);
+        shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2);
+
+        __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u);
+
+        // Gather path: decodes 16 values per pass but only runs while at least 32 remain.
+        // NOTE(review): presumably so the 8-byte gathers never read past the data of the
+        // following group -- confirm against gatherIdxTable23u.
+        while (numElements >= 32) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
+
+          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+
+        // Final full group of 16 values, read with a masked load instead of a gather.
+        if (numElements >= 16) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);
+
+          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
+          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
+
+          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+          zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
+          zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
+          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
+
+          // gathering even and odd elements together
+          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          // shift by 32 - bitWidth before the second nibble-table pass
+          zmm[0] = _mm512_slli_epi32(zmm[0], 9);
+          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
+          highNibblemm = _mm512_srli_epi16(zmm[0], 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
+          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        if (startBit != 0) {
+          bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit,
+                                    ORC_VECTOR_BYTE_WIDTH);
+        } else {
+          bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        }
+        plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 24 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // 24-bit values always end on a byte boundary, so this variant keeps no startBit.
+  // Each pass of the outer loop decodes the values the buffered bytes allow: full groups
+  // of 16 values (2 * bitWidth bytes) with AVX-512, the rest through
+  // unpackDefault.unrolledUnpack24(). If the buffer held everything the function returns;
+  // otherwise resetBufferStart() is called with resetBuf = true and the loop continues
+  // with the values still owed.
+  void UnpackAvx512::unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 24;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        // Everything fits; this pass ends in the `return` below, so len is not updated.
+        numElements = len;
+        resetBuf = false;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth;
+        len -= numElements;
+        tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+        resetBuf = true;
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (numElements >= 16) {
+        __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
+
+        __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0);
+        __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u);
+
+        // 16 values per pass; each pass consumes 2 * bitWidth bytes of input
+        while (numElements >= 16) {
+          __m512i srcmm, zmm;
+
+          srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
+
+          zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm);
+          zmm = _mm512_shuffle_epi8(zmm, shuffleIdx);
+
+          _mm512_storeu_si512(vectorBuf32, zmm);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        unpackDefault.unrolledUnpack24(dstPtr, 0, numElements);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        unpackDefault.unrolledUnpack24(dstPtr, 0, 1);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  // Unpacks `len` values of 26 bits each from the decoder's buffer into
+  // data[offset .. offset + len).
+  //
+  // Each pass of the outer loop decodes as many values as the buffered bytes allow:
+  //   1. if startBit is non-zero, up to getAlign() values go through plainUnpackLongs();
+  //   2. full groups of 16 values (2 * bitWidth bytes) are decoded with AVX-512 -- a
+  //      64-bit gather while at least 32 values remain, then one masked load;
+  //   3. the remaining values go through plainUnpackLongs().
+  // If the buffer held everything the function returns; otherwise resetBufferStart() is
+  // called with resetBuf = true and the loop continues with the values still owed.
+  // NOTE(review): startBit is only ever passed to plainUnpackLongs() here, so it changes
+  // only if that helper takes it by reference -- confirm.
+  void UnpackAvx512::unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 26;
+    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        // Not enough bytes buffered: decode the whole values that fit and remember how
+        // many bits are left over after the last whole value.
+        if (startBit != 0) {
+          numElements =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) /
+              bitWidth;
+          len -= numElements;
+          tailBitLen =
+              (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) % bitWidth;
+          resetBuf = true;
+        } else {
+          numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth;
+          len -= numElements;
+          tailBitLen = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) % bitWidth;
+          resetBuf = true;
+        }
+      }
+
+      if (tailBitLen != 0) {
+        // whole bytes contained in the left-over bits
+        backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH;
+        tailBitLen = 0;
+      }
+
+      if (startBit > 0) {
+        uint32_t align = getAlign(startBit, bitWidth, 32);
+        if (align > numElements) {
+          align = numElements;
+        }
+        if (align != 0) {
+          // Use moveLen() here as widths 20-23 and this function's own tail path do, so
+          // the bytes added to bufMoveByteLen above and the bytes removed here are
+          // rounded the same way when `align` was clamped to numElements.
+          bufMoveByteLen -=
+              moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+          plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit);
+          srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          dstPtr += align;
+          numElements -= align;
+        }
+      }
+
+      if (numElements >= 16) {
+        __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
+        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
+        __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable);
+        __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u);
+        __m512i maskmm = _mm512_set1_epi8(0x0F);
+
+        __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0);
+
+        __m512i permutexIdxPtr[2];
+        permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0);
+        permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1);
+
+        __m512i shiftMaskPtr[3];
+        shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0);
+        shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1);
+        shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2);
+
+        __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u);
+
+        // Gather path: decodes 16 values per pass but only runs while at least 32 remain.
+        // NOTE(review): presumably so the 8-byte gathers never read past the data of the
+        // following group -- confirm against gatherIdxTable26u.
+        while (numElements >= 32) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
+
+          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+
+        // Final full group of 16 values, read with a masked load instead of a gather.
+        if (numElements >= 16) {
+          __m512i srcmm, zmm[2];
+
+          srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
+
+          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
+          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
+
+          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
+          zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
+          zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
+
+          // shifting elements so they start from the start of the word
+          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
+          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
+
+          // gathering even and odd elements together
+          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
+          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
+
+          // shift by 32 - bitWidth before the second nibble-table pass
+          zmm[0] = _mm512_slli_epi32(zmm[0], 6);
+          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
+          highNibblemm = _mm512_srli_epi16(zmm[0], 4);
+          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
+
+          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
+          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
+          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
+
+          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
+          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
+
+          _mm512_storeu_si512(vectorBuf32, zmm[0]);
+
+          srcPtr += 2 * bitWidth;
+          decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false,
+                                    0);
+          bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+          bufMoveByteLen -= 2 * bitWidth;
+          numElements -= 16;
+          std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr);
+          dstPtr += 16;
+        }
+      }
+
+      if (numElements > 0) {
+        if (startBit != 0) {
+          bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit,
+                                    ORC_VECTOR_BYTE_WIDTH);
+        } else {
+          bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+        }
+        plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit);
+        srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+        dstPtr += numElements;
+        bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen,
+                                  resetBuf, backupByteLen);
+        return;
+      }
+
+      if (backupByteLen != 0) {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+        // decode the single value that was split across the buffer switch
+        plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit);
+        dstPtr++;
+        backupByteLen = 0;
+        len--;
+      } else {
+        decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen,
+                                  resetBuf, backupByteLen);
+      }
+
+      bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+      bufMoveByteLen = 0;
+      srcPtr = reinterpret_cast<const uint8_t*>(decoder->bufferStart);
+    }
+  }
+
+  void UnpackAvx512::unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len) {
+    uint32_t bitWidth = 28;
+    const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart);
+    uint64_t numElements = 0;
+    int64_t* dstPtr = data + offset;
+    uint64_t bufMoveByteLen = 0;
+    uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart;
+    bool resetBuf = false;
+    uint64_t startBit = 0;
+    uint64_t tailBitLen = 0;
+    uint32_t backupByteLen = 0;
+
+    while (len > 0) {
+      if (startBit != 0) {
+        bufMoveByteLen +=
+            moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH);
+      } else {
+        bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH);
+      }
+
+      if (bufMoveByteLen <= bufRestByteLen) {
+        numElements = len;
+        resetBuf = false;
+        len -= numElements;
+      } else {
+        if (startBit
!= 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_load_si512(shiftTable28u); + + while (numElements >= 16) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - 
decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 30; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + 
if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / + bitWidth; + len -= numElements; + tailBitLen = fmod( + bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, 32); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + + if (numElements >= 16) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); + shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); + permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); + shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); + 
shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); + shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); + + __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); + + while (numElements >= 32) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + if (numElements >= 16) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the 
word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 2u); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4u); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf32, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, + ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen 
= 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 32; + const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + } else { + numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (numElements >= 16) { + __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + while (numElements >= 16) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); + _mm512_storeu_si512(vectorBuf32, srcmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= 16; + std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); + dstPtr += 16; + } + } + + if (numElements > 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); + srcPtr = reinterpret_cast(decoder->bufferStart); + 
dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + ; + unpackDefault.unrolledUnpack32(dstPtr, 0, 1); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + } + + void UnpackAvx512::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast(bitsLeftToRead); + result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); + } + } +#endif + +} // namespace orc diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh new file mode 100644 index 0000000000..e177210388 --- /dev/null +++ b/c++/src/BpackingAvx512.hh @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
  /**
   * AVX-512 bit-unpacking routines for RleDecoderV2.
   *
   * Each unrolledUnpackVectorN() decodes `len` values that are N bits wide
   * from the decoder's input buffer and writes them, widened to int64_t, to
   * data[offset ... offset + len). Widths without a dedicated method here
   * (for example 8) are presumably served by UnpackDefault - confirm at the
   * call site.
   */
  class UnpackAvx512 {
   public:
    // `dec` is the decoder whose input buffer is consumed; it is stored as a
    // raw pointer.
    UnpackAvx512(RleDecoderV2* dec);
    ~UnpackAvx512();

    void unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len);
    void unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len);

    // Scalar fallback: decodes `len` values of `fbs` bits each and reports in
    // `startBit` how many bits of the current input byte have been consumed.
    void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs,
                          uint64_t& startBit);

   private:
    RleDecoderV2* decoder;        // decoder that owns the input buffer
    UnpackDefault unpackDefault;  // scalar routines used for byte-aligned tails

    uint32_t bitsLeft;  // unread bits remaining in curByte (scalar path)
    uint32_t curByte;   // input byte currently being consumed (scalar path)

    // Staging buffers for one 512-bit vector of decoded values (64 x 8 bit,
    // 32 x 16 bit, 16 x 32 bit) before they are widened into the int64_t
    // output. NOTE(review): the purpose of the extra "+ 1" slot is not visible
    // here.
    uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1];     // bit widths 1 to 8
    uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1];  // bit widths 9 to 16
    uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1];  // bit widths 17 to 32
  };
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BpackingDefault.hh" +#include "Utils.hh" + +namespace orc { + + UnpackDefault::UnpackDefault(RleDecoderV2* dec) + : decoder(dec), bitsLeft(decoder->bitsLeft), curByte(decoder->curByte) { + // PASS + } + + UnpackDefault::~UnpackDefault() { + // PASS + } + + void UnpackDefault::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. + while (bitsLeft > 0 && curIdx < offset + len) { + bitsLeft -= 4; + data[curIdx++] = (curByte >> bitsLeft) & 15; + } + if (curIdx == offset + len) return; + + // Exhaust the buffer + uint64_t numGroups = (offset + len - curIdx) / 2; + numGroups = + std::min(numGroups, static_cast(decoder->bufferEnd - decoder->bufferStart)); + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + uint32_t localByte; + for (uint64_t i = 0; i < numGroups; ++i) { + localByte = *buffer++; + data[curIdx] = (localByte >> 4) & 15; + data[curIdx + 1] = localByte & 15; + curIdx += 2; + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd' + curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + bitsLeft = 8; + } + } + + void UnpackDefault::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufferEnd - decoder->bufferStart; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + // Avoid updating 'bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + data[curIdx++] = *buffer++; + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd'. + data[curIdx++] = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + } + } + + void UnpackDefault::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 2; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint16_t b0, b1; + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + buffer += 2; + data[curIdx++] = (b0 << 8) | b1; + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = (b0 << 8) | b1; + } + } + + void UnpackDefault::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 3; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2; + // Avoid updating 'decoder->bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + buffer += 3; + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + decoder->bufferStart += bufferNum * 3; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. 
+ b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + } + + void UnpackDefault::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 4; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2, b3; + // Avoid updating 'decoder->bufferStart' inside the loop. + const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + buffer += 4; + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + } + + void UnpackDefault::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 5; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4; + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + buffer += 5; + data[curIdx++] = + static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + } + + void UnpackDefault::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 6; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5; + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + buffer += 6; + data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | + (b4 << 8) | b5); + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = + static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } + } + + void UnpackDefault::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 7; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6; + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + buffer += 7; + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b6 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + } + + void UnpackDefault::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 8; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6, b7; + // Avoid updating 'decoder->bufferStart' inside the loop. 
+ const auto* buffer = reinterpret_cast(decoder->bufferStart); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + b7 = static_cast(*(buffer + 7)); + buffer += 8; + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + decoder->bufferStart = (char*)buffer; + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'decoder->bufferStart' and + // 'decoder->bufferEnd'. + b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b6 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b7 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + } + + void UnpackDefault::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast(bitsLeftToRead); + result |= 
(curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + } + } + +} // namespace orc diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh new file mode 100644 index 0000000000..c3d37c4ee8 --- /dev/null +++ b/c++/src/BpackingDefault.hh @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_BPACKINGDEFAULT_HH +#define ORC_BPACKINGDEFAULT_HH + +#include +#include + +// #include "Adaptor.hh" +#include "RLEv2.hh" +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +namespace orc { + + class UnpackDefault { + public: + UnpackDefault(RleDecoderV2* dec); + ~UnpackDefault(); + + void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); + + void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + + /* void setBuf(char* bufStart, char* bufEnd) { + bufferStart = bufStart; + bufferEnd = bufEnd; + } + + void getBuf(char** bufStart, char** bufEnd) { + *bufStart = bufferStart; + *bufEnd = bufferEnd; + }*/ + + private: + RleDecoderV2* decoder; + // char* bufferStart; + // char* bufferEnd; + uint32_t bitsLeft; + uint32_t curByte; + }; + +} // namespace orc + +#endif diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 0482079530..6aa6c1f1cc 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -1,110 +1,103 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +#Licensed under the Apache License, Version 2.0(the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +#http: // www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") -INCLUDE(CheckCXXSourceCompiles) + INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" - #include - #include - int main(int,char*[]){ + CHECK_CXX_SOURCE_COMPILES(" +#include +#include + int main(int, char*[]) { int f = open(\"/x/y\", O_RDONLY); char buf[100]; return pread(f, buf, 100, 1000) == 0; - }" - HAS_PREAD -) - -CHECK_CXX_SOURCE_COMPILES(" - #include - int main(int,char*[]){ - struct tm time2020; + } " + HAS_PREAD) + + CHECK_CXX_SOURCE_COMPILES(" +#include + int main(int, char*[]) { + struct tm time2020; return !strptime(\"2020-02-02 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2020); - }" - HAS_STRPTIME -) - -CHECK_CXX_SOURCE_COMPILES(" - int main(){ - int a; - return __builtin_add_overflow(1, 2, &a); - }" - HAS_BUILTIN_OVERFLOW_CHECK -) - -CHECK_CXX_SOURCE_COMPILES(" - #ifdef __clang__ - #pragma clang diagnostic push - #pragma clang diagnostic ignored \"-Wdeprecated\" - #pragma clang diagnostic pop - #elif defined(__GNUC__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored \"-Wdeprecated\" - #pragma GCC diagnostic pop - #elif 
defined(_MSC_VER) - #pragma warning( push ) - #pragma warning( disable : 4996 ) - #pragma warning( pop ) - #else + } " + HAS_STRPTIME) + + CHECK_CXX_SOURCE_COMPILES(" + int main() { + int a; + return __builtin_add_overflow(1, 2, &a); + } " + HAS_BUILTIN_OVERFLOW_CHECK) + + CHECK_CXX_SOURCE_COMPILES(" +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored \"-Wdeprecated\" +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored \"-Wdeprecated\" +#pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#pragma warning(pop) +#else unknownCompiler! - #endif - int main(int, char *[]) {}" - HAS_DIAGNOSTIC_PUSH -) - -CHECK_CXX_SOURCE_COMPILES(" - #include - int main(int, char *[]) { - return std::isnan(1.0f); - }" - HAS_STD_ISNAN -) - -CHECK_CXX_SOURCE_COMPILES(" - #include - int main(int, char *[]) { - double d = 5; - std::to_string(d); - }" - HAS_DOUBLE_TO_STRING -) - -CHECK_CXX_SOURCE_COMPILES(" - #include - #include - int main(int, char *[]) { - int64_t d = 5; - std::to_string(d); - }" - HAS_INT64_TO_STRING -) - -INCLUDE(CheckCXXSourceRuns) - -CHECK_CXX_SOURCE_RUNS(" - #include - int main(int, char *[]) { - time_t t = -14210715; // 1969-07-20 12:34:45 - struct tm *ptm = gmtime(&t); - return !(ptm && ptm->tm_year == 69); - }" - HAS_PRE_1970 -) - -CHECK_CXX_SOURCE_RUNS(" - #include - #include - int main(int, char *[]) { +#endif + int main(int, char*[]) {} " + HAS_DIAGNOSTIC_PUSH) + + CHECK_CXX_SOURCE_COMPILES(" +#include + int main(int, char*[]) { + return std::isnan(1.0f); + } " + HAS_STD_ISNAN) + + CHECK_CXX_SOURCE_COMPILES(" +#include + int main(int, char*[]) { + double d = 5; + std::to_string(d); + } " + HAS_DOUBLE_TO_STRING) + + CHECK_CXX_SOURCE_COMPILES(" +#include +#include + int main(int, char*[]) { + int64_t d = 5; + std::to_string(d); + } " + HAS_INT64_TO_STRING) + + INCLUDE(CheckCXXSourceRuns) + + CHECK_CXX_SOURCE_RUNS( + 
" +#include + int main(int, char*[]) { + time_t t = -14210715; // 1969-07-20 12:34:45 + struct tm* ptm = gmtime(&t); + return !(ptm && ptm->tm_year == 69); + } " + HAS_PRE_1970) + + CHECK_CXX_SOURCE_RUNS(" +#include +#include + int main(int, char*[]) { setenv(\"TZ\", \"America/Los_Angeles\", 1); tzset(); struct tm time2037; @@ -112,94 +105,95 @@ CHECK_CXX_SOURCE_RUNS(" strptime(\"2037-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2037); strptime(\"2038-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2038); return (mktime(&time2038) - mktime(&time2037)) <= 31500000; - }" - HAS_POST_2038 -) - -set(CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIR}) -set(CMAKE_REQUIRED_LIBRARIES orc_zlib) -CHECK_CXX_SOURCE_COMPILES(" - #define Z_PREFIX - #include - z_stream strm; - int main(int, char *[]) { - deflateReset(&strm); - }" - NEEDS_Z_PREFIX -) - -configure_file ( - "Adaptor.hh.in" - "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" - ) - -include_directories ( - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${LIBHDFSPP_INCLUDE_DIR} - ) - -add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc - COMMAND ${PROTOBUF_EXECUTABLE} - -I ${PROJECT_SOURCE_DIR}/proto - --cpp_out="${CMAKE_CURRENT_BINARY_DIR}" - "${PROJECT_SOURCE_DIR}/proto/orc_proto.proto" -) - -set(SOURCE_FILES - "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" - orc_proto.pb.h - io/InputStream.cc - io/OutputStream.cc - sargs/ExpressionTree.cc - sargs/Literal.cc - sargs/PredicateLeaf.cc - sargs/SargsApplier.cc - sargs/SearchArgument.cc - sargs/TruthValue.cc - wrap/orc-proto-wrapper.cc - Adaptor.cc - BlockBuffer.cc - BloomFilter.cc - ByteRLE.cc - ColumnPrinter.cc - ColumnReader.cc - ColumnWriter.cc - Common.cc - Compression.cc - Exceptions.cc - Int128.cc - LzoDecompressor.cc - MemoryPool.cc - Murmur3.cc - OrcFile.cc - Reader.cc - RLEv1.cc - RLEV2Util.cc - RleDecoderV2.cc - RleEncoderV2.cc - RLE.cc - Statistics.cc - StripeStream.cc - Timezone.cc - TypeImpl.cc - Vector.cc - Writer.cc) - -if(BUILD_LIBHDFSPP) - set(SOURCE_FILES 
${SOURCE_FILES} OrcHdfsFile.cc) - add_definitions(-DBUILD_LIBHDFSPP) -endif(BUILD_LIBHDFSPP) - -add_library (orc STATIC ${SOURCE_FILES}) - -target_link_libraries (orc - orc::protobuf - orc::zlib - orc::snappy - orc::lz4 - orc::zstd - ${LIBHDFSPP_LIBRARIES} - ) - -install(TARGETS orc DESTINATION lib) + } " + HAS_POST_2038) + + set(CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIR}) set( + CMAKE_REQUIRED_LIBRARIES + orc_zlib) CHECK_CXX_SOURCE_COMPILES(" +#define Z_PREFIX +#include + z_stream + strm; + int main( + int, + char*[]) { + deflateReset( + &strm); + } " + NEEDS_Z_PREFIX) + + configure_file( + "Adaptor.hh.in" + "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh") + + include_directories(${ + CMAKE_CURRENT_SOURCE_DIR} ${ + CMAKE_CURRENT_BINARY_DIR} ${ + LIBHDFSPP_INCLUDE_DIR}) + + add_custom_command( + OUTPUT orc_proto.pb.h orc_proto.pb + .cc COMMAND ${ + PROTOBUF_EXECUTABLE} - + I ${PROJECT_SOURCE_DIR} / + proto-- cpp_out = + "${CMAKE_CURRENT_BINARY_DIR}" + "${PROJECT_SOURCE_DIR}/proto/" + "orc_proto.proto") + + set(SOURCE_FILES + "${CMAKE_CURRENT_BINARY_" + "DIR}/Adaptor.hh" orc_proto + .pb.h io / + InputStream.cc io / + OutputStream.cc sargs / + ExpressionTree.cc sargs / + Literal.cc sargs / + PredicateLeaf.cc sargs / + SargsApplier.cc sargs / + SearchArgument.cc sargs / + TruthValue.cc wrap / orc - + proto - + wrapper.cc Adaptor + .cc BlockBuffer + .cc BloomFilter.cc ByteRLE + .cc ColumnPrinter + .cc ColumnReader + .cc ColumnWriter.cc Common + .cc Compression + .cc Exceptions.cc Int128 + .cc LzoDecompressor + .cc MemoryPool.cc Murmur3 + .cc OrcFile.cc Reader + .cc RLEv1.cc RLEV2Util + .cc RleDecoderV2 + .cc RleEncoderV2.cc RLE + .cc Statistics + .cc StripeStream.cc Timezone + .cc TypeImpl.cc Vector + .cc Writer.cc CpuInfoUtil + .cc BpackingDefault + .cc BpackingAvx512 + .cc Bpacking.cc) + + if (BUILD_LIBHDFSPP) set( + SOURCE_FILES ${ + SOURCE_FILES} OrcHdfsFile + .cc) add_definitions(-DBUILD_LIBHDFSPP) + endif(BUILD_LIBHDFSPP) + + add_library( + orc STATIC ${ + 
SOURCE_FILES}) + + target_link_libraries( + orc orc::protobuf + orc::zlib orc::snappy + orc::lz4 orc::zstd + ${LIBHDFSPP_LIBRARIES}) + + install( + TARGETS orc + DESTINATION + lib) diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc new file mode 100644 index 0000000000..02e7a7cb60 --- /dev/null +++ b/c++/src/CpuInfoUtil.cc @@ -0,0 +1,600 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CpuInfoUtil.hh" + +#ifdef __APPLE__ +#include +#endif + +#ifndef _MSC_VER +#include +#endif + +#ifdef _WIN32 +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "orc/Exceptions.hh" + +#undef CPUINFO_ARCH_X86 +#undef CPUINFO_ARCH_ARM +#undef CPUINFO_ARCH_PPC + +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +#define CPUINFO_ARCH_X86 +#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) +#define CPUINFO_ARCH_ARM +#elif defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) +#define CPUINFO_ARCH_PPC +#endif + +namespace orc { + + namespace { + + constexpr int kCacheLevels = static_cast(CpuInfo::CacheLevel::Last) + 1; + + //============================== OS Dependent ==============================// + +#if defined(_WIN32) + //------------------------------ WINDOWS ------------------------------// + void OsRetrieveCacheSize(std::array* cache_sizes) { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer_position = nullptr; + DWORD buffer_size = 0; + size_t offset = 0; + typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*); + GetLogicalProcessorInformationFuncPointer func_pointer = + (GetLogicalProcessorInformationFuncPointer)GetProcAddress( + GetModuleHandle("kernel32"), "GetLogicalProcessorInformation"); + + if (!func_pointer) { + throw ParseError("Failed to find procedure GetLogicalProcessorInformation"); + } + + // Get buffer size + if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + throw ParseError("Failed to get size of processor information buffer"); + } + + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size); + if (!buffer) { + return; + } + + if (!func_pointer(buffer, &buffer_size)) { + free(buffer); + throw ParseError("Failed to 
get processor information"); + } + + buffer_position = buffer; + while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) { + if (RelationCache == buffer_position->Relationship) { + PCACHE_DESCRIPTOR cache = &buffer_position->Cache; + if (cache->Level >= 1 && cache->Level <= kCacheLevels) { + const int64_t current = (*cache_sizes)[cache->Level - 1]; + (*cache_sizes)[cache->Level - 1] = std::max(current, cache->Size); + } + } + offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buffer_position++; + } + + free(buffer); + } + +#if defined(CPUINFO_ARCH_X86) + // On x86, get CPU features by cpuid, https://en.wikipedia.org/wiki/CPUID + +#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5 + void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) { + __asm__ __volatile__("cpuid" + : "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]), "=d"(CPUInfo[3]) + : "a"(function_id), "c"(subfunction_id)); + } + + int64_t _xgetbv(int xcr) { + int out = 0; + __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx"); + return out; + } +#endif // MINGW + + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + int register_EAX_id = 1; + int highest_valid_id = 0; + int highest_extended_valid_id = 0; + std::bitset<32> features_ECX; + std::array cpu_info; + + // Get highest valid id + __cpuid(cpu_info.data(), 0); + highest_valid_id = cpu_info[0]; + // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C + // HEX of "AuthenticAMD": 41757468 656E7469 63414D44 + if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + *vendor = CpuInfo::Vendor::Intel; + } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && + cpu_info[2] == 0x444d4163) { + *vendor = CpuInfo::Vendor::AMD; + } + + if (highest_valid_id <= register_EAX_id) { + return; + } + + // EAX=1: Processor Info and Feature Bits + __cpuidex(cpu_info.data(), register_EAX_id, 0); + 
features_ECX = cpu_info[2]; + + // Get highest extended id + __cpuid(cpu_info.data(), 0x80000000); + highest_extended_valid_id = cpu_info[0]; + + // Retrieve CPU model name + if (highest_extended_valid_id >= static_cast(0x80000004)) { + model_name->clear(); + for (int i = 0x80000002; i <= static_cast(0x80000004); ++i) { + __cpuidex(cpu_info.data(), i, 0); + *model_name += std::string(reinterpret_cast(cpu_info.data()), sizeof(cpu_info)); + } + } + + bool zmm_enabled = false; + if (features_ECX[27]) { // OSXSAVE + // Query if the OS supports saving ZMM registers when switching contexts + int64_t xcr0 = _xgetbv(0); + zmm_enabled = (xcr0 & 0xE0) == 0xE0; + } + + if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + + // cpuid with EAX=7, ECX=0: Extended Features + register_EAX_id = 7; + if (highest_valid_id > register_EAX_id) { + __cpuidex(cpu_info.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = cpu_info[1]; + + if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (zmm_enabled) { + if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; + if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + } + } + } +#elif defined(CPUINFO_ARCH_ARM) + // Windows on Arm + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + *hardware_flags |= CpuInfo::ASIMD; + // TODO: vendor, model_name + } +#endif + +#elif defined(__APPLE__) + //------------------------------ MACOS 
------------------------------// + std::optional IntegerSysCtlByName(const char* name) { + size_t len = sizeof(int64_t); + int64_t data = 0; + if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) { + return data; + } + // ENOENT is the official errno value for non-existing sysctl's, + // but EINVAL and ENOTSUP have been seen in the wild. + if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) { + auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'"); + throw ParseError(st.ToString()); + } + return std::nullopt; + } + + void OsRetrieveCacheSize(std::array* cache_sizes) { + static_assert(kCacheLevels >= 3, ""); + auto c = IntegerSysCtlByName("hw.l1dcachesize"); + if (c.has_value()) { + (*cache_sizes)[0] = *c; + } + c = IntegerSysCtlByName("hw.l2cachesize"); + if (c.has_value()) { + (*cache_sizes)[1] = *c; + } + c = IntegerSysCtlByName("hw.l3cachesize"); + if (c.has_value()) { + (*cache_sizes)[2] = *c; + } + } + + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + // hardware_flags + struct SysCtlCpuFeature { + const char* name; + int64_t flag; + }; + std::vector features = { +#if defined(CPUINFO_ARCH_X86) + {"hw.optional.sse4_2", + CpuInfo::SSSE3 | CpuInfo::SSE4_1 | CpuInfo::SSE4_2 | CpuInfo::POPCNT}, + {"hw.optional.avx1_0", CpuInfo::AVX}, + {"hw.optional.avx2_0", CpuInfo::AVX2}, + {"hw.optional.bmi1", CpuInfo::BMI1}, + {"hw.optional.bmi2", CpuInfo::BMI2}, + {"hw.optional.avx512f", CpuInfo::AVX512F}, + {"hw.optional.avx512cd", CpuInfo::AVX512CD}, + {"hw.optional.avx512dq", CpuInfo::AVX512DQ}, + {"hw.optional.avx512bw", CpuInfo::AVX512BW}, + {"hw.optional.avx512vl", CpuInfo::AVX512VL}, +#elif defined(CPUINFO_ARCH_ARM) + // ARM64 (note that this is exposed under Rosetta as well) + {"hw.optional.neon", CpuInfo::ASIMD}, +#endif + }; + for (const auto& feature : features) { + auto v = IntegerSysCtlByName(feature.name); + if (v.value_or(0)) { + *hardware_flags |= feature.flag; + } + } + + // 
TODO: vendor, model_name + } + +#else + //------------------------------ LINUX ------------------------------// + // Get cache size, return 0 on error + int64_t LinuxGetCacheSize(int level) { + // get cache size by sysconf() +#ifdef _SC_LEVEL1_DCACHE_SIZE + const int kCacheSizeConf[] = { + _SC_LEVEL1_DCACHE_SIZE, + _SC_LEVEL2_CACHE_SIZE, + _SC_LEVEL3_CACHE_SIZE, + }; + static_assert(sizeof(kCacheSizeConf) / sizeof(kCacheSizeConf[0]) == kCacheLevels, ""); + + errno = 0; + const int64_t cache_size = sysconf(kCacheSizeConf[level]); + if (errno == 0 && cache_size > 0) { + return cache_size; + } +#endif + + // get cache size from sysfs if sysconf() fails or not supported + const char* kCacheSizeSysfs[] = { + "/sys/devices/system/cpu/cpu0/cache/index0/size", // l1d (index1 is l1i) + "/sys/devices/system/cpu/cpu0/cache/index2/size", // l2 + "/sys/devices/system/cpu/cpu0/cache/index3/size", // l3 + }; + static_assert(sizeof(kCacheSizeSysfs) / sizeof(kCacheSizeSysfs[0]) == kCacheLevels, ""); + + std::ifstream cacheinfo(kCacheSizeSysfs[level], std::ios::in); + if (!cacheinfo) { + return 0; + } + // cacheinfo is one line like: 65536, 64K, 1M, etc. + uint64_t size = 0; + char unit = '\0'; + cacheinfo >> size >> unit; + if (unit == 'K') { + size <<= 10; + } else if (unit == 'M') { + size <<= 20; + } else if (unit == 'G') { + size <<= 30; + } else if (unit != '\0') { + return 0; + } + return static_cast(size); + } + + // Helper function to parse for hardware flags from /proc/cpuinfo + // values contains a list of space-separated flags. check to see if the flags we + // care about are present. + // Returns a bitmap of flags. 
+ int64_t LinuxParseCpuFlags(const std::string& values) { + const struct { + std::string name; + int64_t flag; + } flag_mappings[] = { +#if defined(CPUINFO_ARCH_X86) + {"ssse3", CpuInfo::SSSE3}, + {"sse4_1", CpuInfo::SSE4_1}, + {"sse4_2", CpuInfo::SSE4_2}, + {"popcnt", CpuInfo::POPCNT}, + {"avx", CpuInfo::AVX}, + {"avx2", CpuInfo::AVX2}, + {"avx512f", CpuInfo::AVX512F}, + {"avx512cd", CpuInfo::AVX512CD}, + {"avx512vl", CpuInfo::AVX512VL}, + {"avx512dq", CpuInfo::AVX512DQ}, + {"avx512bw", CpuInfo::AVX512BW}, + {"bmi1", CpuInfo::BMI1}, + {"bmi2", CpuInfo::BMI2}, +#elif defined(CPUINFO_ARCH_ARM) + {"asimd", CpuInfo::ASIMD}, +#endif + }; + const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + + int64_t flags = 0; + for (int i = 0; i < num_flags; ++i) { + if (values.find(flag_mappings[i].name) != std::string::npos) { + flags |= flag_mappings[i].flag; + } + } + return flags; + } + + void OsRetrieveCacheSize(std::array* cache_sizes) { + for (int i = 0; i < kCacheLevels; ++i) { + const int64_t cache_size = LinuxGetCacheSize(i); + if (cache_size > 0) { + (*cache_sizes)[i] = cache_size; + } + } + } + + static constexpr bool IsWhitespace(char c) { + return c == ' ' || c == '\t'; + } + + std::string TrimString(std::string value) { + size_t ltrim_chars = 0; + while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) { + ++ltrim_chars; + } + value.erase(0, ltrim_chars); + size_t rtrim_chars = 0; + while (rtrim_chars < value.size() && IsWhitespace(value[value.size() - 1 - rtrim_chars])) { + ++rtrim_chars; + } + value.erase(value.size() - rtrim_chars, rtrim_chars); + return value; + } + + // Read from /proc/cpuinfo + // TODO: vendor, model_name for Arm + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); + while (cpuinfo) { + std::string line; + std::getline(cpuinfo, line); + const size_t colon = line.find(':'); + if (colon != 
std::string::npos) { + const std::string name = TrimString(line.substr(0, colon - 1)); + const std::string value = TrimString(line.substr(colon + 1, std::string::npos)); + if (name.compare("flags") == 0 || name.compare("Features") == 0) { + *hardware_flags |= LinuxParseCpuFlags(value); + } else if (name.compare("model name") == 0) { + *model_name = value; + } else if (name.compare("vendor_id") == 0) { + if (value.compare("GenuineIntel") == 0) { + *vendor = CpuInfo::Vendor::Intel; + } else if (value.compare("AuthenticAMD") == 0) { + *vendor = CpuInfo::Vendor::AMD; + } + } + } + } + } +#endif // WINDOWS, MACOS, LINUX + + //============================== Arch Dependent ==============================// + +#if defined(CPUINFO_ARCH_X86) + //------------------------------ X86_64 ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + enum { + USER_SIMD_NONE, + USER_SIMD_SSE4_2, + USER_SIMD_AVX, + USER_SIMD_AVX2, + USER_SIMD_AVX512, + USER_SIMD_MAX, + }; + + int level = USER_SIMD_MAX; + // Parse the level + if (simd_level == "AVX512") { + level = USER_SIMD_AVX512; + } else if (simd_level == "AVX2") { + level = USER_SIMD_AVX2; + } else if (simd_level == "AVX") { + level = USER_SIMD_AVX; + } else if (simd_level == "SSE4_2") { + level = USER_SIMD_SSE4_2; + } else if (simd_level == "NONE") { + level = USER_SIMD_NONE; + } else { + return false; + } + + // Disable feature as the level + if (level < USER_SIMD_AVX512) { + *hardware_flags &= ~CpuInfo::AVX512; + } + if (level < USER_SIMD_AVX2) { + *hardware_flags &= ~(CpuInfo::AVX2 | CpuInfo::BMI2); + } + if (level < USER_SIMD_AVX) { + *hardware_flags &= ~CpuInfo::AVX; + } + if (level < USER_SIMD_SSE4_2) { + *hardware_flags &= ~(CpuInfo::SSE4_2 | CpuInfo::BMI1); + } + return true; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) { +#if defined(ORC_HAVE_AVX512) + if (!ci->IsDetected(CpuInfo::AVX512)) { + throw ParseError("CPU does not support the Supplemental 
AVX512 instruction set"); + } +#endif + } + +#elif defined(CPUINFO_ARCH_ARM) + //------------------------------ AARCH64 ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + if (simd_level == "NONE") { + *hardware_flags &= ~CpuInfo::ASIMD; + return true; + } + return false; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) { + if (!ci->IsDetected(CpuInfo::ASIMD)) { + throw ParseError("CPU does not support the Armv8 Neon instruction set"); + } + } + +#else + //------------------------------ PPC, ... ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + return true; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) {} + +#endif // X86, ARM, PPC + + } // namespace + + struct CpuInfo::Impl { + int64_t hardware_flags = 0; + int numCores = 0; + int64_t original_hardware_flags = 0; + Vendor vendor = Vendor::Unknown; + std::string model_name = "Unknown"; + std::array cache_sizes{}; + + Impl() { + OsRetrieveCacheSize(&cache_sizes); + OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); + original_hardware_flags = hardware_flags; + numCores = std::max(static_cast(std::thread::hardware_concurrency()), 1); + + // parse user simd level + const auto maybe_env_var = std::getenv("ORC_USER_SIMD_LEVEL"); + std::string userSimdLevel = maybe_env_var == nullptr ? 
"NONE" : std::string(maybe_env_var); + std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), + [](unsigned char c) { return std::toupper(c); }); + if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); + } + } + + // void EnableFeature(int64_t flag, bool enable) { + // if (!enable) { + // hardware_flags &= ~flag; + // } else { + // // Can't turn something on that can't be supported + // DCHECK_EQ((~original_hardware_flags) & flag, 0); + // hardware_flags |= (flag & original_hardware_flags); + // } + // } + }; + + CpuInfo::~CpuInfo() = default; + + CpuInfo::CpuInfo() : impl_(new Impl) {} + + const CpuInfo* CpuInfo::GetInstance() { + static CpuInfo cpu_info; + return &cpu_info; + } + + int64_t CpuInfo::hardwareFlags() const { + return impl_->hardware_flags; + } + + int CpuInfo::numCores() const { + return impl_->numCores <= 0 ? 1 : impl_->numCores; + } + + CpuInfo::Vendor CpuInfo::vendor() const { + return impl_->vendor; + } + + const std::string& CpuInfo::modelName() const { + return impl_->model_name; + } + + int64_t CpuInfo::CacheSize(CacheLevel level) const { + constexpr int64_t kDefaultCacheSizes[] = { + 32 * 1024, // Level 1: 32K + 256 * 1024, // Level 2: 256K + 3072 * 1024, // Level 3: 3M + }; + static_assert(sizeof(kDefaultCacheSizes) / sizeof(kDefaultCacheSizes[0]) == kCacheLevels, ""); + + static_assert(static_cast(CacheLevel::L1) == 0, ""); + const int i = static_cast(level); + if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (i == 0) return kDefaultCacheSizes[0]; + // l3 may be not available, return maximum of l2 or default size + return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + } + + bool CpuInfo::IsSupported(int64_t flags) const { + return (impl_->hardware_flags & flags) == flags; + } + + bool CpuInfo::IsDetected(int64_t flags) const { + return (impl_->original_hardware_flags & flags) == flags; + } + + 
void CpuInfo::VerifyCpuRequirements() const { + return ArchVerifyCpuRequirements(this); + } + + // void CpuInfo::EnableFeature(int64_t flag, bool enable) { + // impl_->EnableFeature(flag, enable); + // } + +} // namespace orc + +#undef CPUINFO_ARCH_X86 +#undef CPUINFO_ARCH_ARM +#undef CPUINFO_ARCH_PPC diff --git a/c++/src/CpuInfoUtil.hh b/c++/src/CpuInfoUtil.hh new file mode 100644 index 0000000000..dfc3a75843 --- /dev/null +++ b/c++/src/CpuInfoUtil.hh @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_CPUINFOUTIL_HH +#define ORC_CPUINFOUTIL_HH + +#include +#include +#include + +namespace orc { + + /// CpuInfo is an interface to query for cpu information at runtime. The caller can + /// ask for the sizes of the caches and what hardware features are supported. 
+ /// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and + /// /sys/devices) + class CpuInfo { + public: + ~CpuInfo(); + + /// x86 features + static constexpr int64_t SSSE3 = (1LL << 0); + static constexpr int64_t SSE4_1 = (1LL << 1); + static constexpr int64_t SSE4_2 = (1LL << 2); + static constexpr int64_t POPCNT = (1LL << 3); + static constexpr int64_t AVX = (1LL << 4); + static constexpr int64_t AVX2 = (1LL << 5); + static constexpr int64_t AVX512F = (1LL << 6); + static constexpr int64_t AVX512CD = (1LL << 7); + static constexpr int64_t AVX512VL = (1LL << 8); + static constexpr int64_t AVX512DQ = (1LL << 9); + static constexpr int64_t AVX512BW = (1LL << 10); + static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW; + static constexpr int64_t BMI1 = (1LL << 11); + static constexpr int64_t BMI2 = (1LL << 12); + + /// Arm features + static constexpr int64_t ASIMD = (1LL << 32); + + /// Cache enums for L1 (data), L2 and L3 + enum class CacheLevel { L1 = 0, L2, L3, Last = L3 }; + + /// CPU vendors + enum class Vendor { Unknown, Intel, AMD }; + + static const CpuInfo* GetInstance(); + + /// Returns all the flags for this cpu + int64_t hardwareFlags() const; + + /// Returns the number of cores (including hyper-threaded) on this machine. + int numCores() const; + + /// Returns the vendor of the cpu. + Vendor vendor() const; + + /// Returns the model name of the cpu (e.g. Intel i7-2600) + const std::string& modelName() const; + + /// Returns the size of the cache in bytes at this cache level + int64_t CacheSize(CacheLevel level) const; + + /// \brief Returns whether or not the given feature is enabled. + /// + /// IsSupported() is true if IsDetected() is also true and the feature + /// wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL + /// environment variable). + bool IsSupported(int64_t flags) const; + + /// Returns whether or not the given feature is available on the CPU. 
+ bool IsDetected(int64_t flags) const; + + /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error + /// and terminate. + void VerifyCpuRequirements() const; + + /// Toggle a hardware feature on and off. It is not valid to turn on a feature + /// that the underlying hardware cannot support. This is useful for testing. + // void EnableFeature(int64_t flag, bool enable); + + bool HasEfficientBmi2() const { + // BMI2 (pext, pdep) is only efficient on Intel X86 processors. + return vendor() == Vendor::Intel && IsSupported(BMI2); + } + + private: + CpuInfo(); + + struct Impl; + std::unique_ptr impl_; + }; + +} // namespace orc + +#endif diff --git a/c++/src/DetectPlatform.hh b/c++/src/DetectPlatform.hh deleted file mode 100644 index a1df00b1e8..0000000000 --- a/c++/src/DetectPlatform.hh +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_DETECTPLATFORM_HH -#define ORC_DETECTPLATFORM_HH - -#if defined(__GNUC__) || defined(__clang__) -DIAGNOSTIC_IGNORE("-Wold-style-cast") -#endif - -namespace orc { -#ifdef _WIN32 - -#include "intrin.h" -// Windows CPUID -#define cpuid(info, x) __cpuidex(info, x, 0) -#else -// GCC Intrinsics -#include -#include - - void cpuid(int info[4], int InfoType) { - __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); - } - - unsigned long long xgetbv(unsigned int index) { - unsigned int eax, edx; - __asm__ __volatile__("xgetbv;" : "=a"(eax), "=d"(edx) : "c"(index)); - return ((unsigned long long)edx << 32) | eax; - } - -#endif - -#define CPUID_AVX512F 0x00100000 -#define CPUID_AVX512CD 0x00200000 -#define CPUID_AVX512VL 0x04000000 -#define CPUID_AVX512BW 0x01000000 -#define CPUID_AVX512DQ 0x02000000 -#define EXC_OSXSAVE 0x08000000 // 27th bit - -#define CPUID_AVX512_MASK \ - (CPUID_AVX512F | CPUID_AVX512CD | CPUID_AVX512VL | CPUID_AVX512BW | CPUID_AVX512DQ) - - enum class Arch { PX_ARCH = 0, AVX2_ARCH = 1, AVX512_ARCH = 2 }; - - Arch detectPlatform() { - Arch detectedPlatform = Arch::PX_ARCH; - int cpuInfo[4]; - cpuid(cpuInfo, 1); - - bool avx512SupportCpu = cpuInfo[1] & CPUID_AVX512_MASK; - bool osUsesXSaveXStore = cpuInfo[2] & EXC_OSXSAVE; - - if (avx512SupportCpu && osUsesXSaveXStore) { - // Check if XMM state and YMM state are saved -#ifdef _WIN32 - unsigned long long xcrFeatureMask = _xgetbv(0); /* min VS2010 SP1 compiler is required */ -#else - unsigned long long xcrFeatureMask = xgetbv(0); -#endif - - if ((xcrFeatureMask & 0x6) == 0x6) { // AVX2 is supported now - if ((xcrFeatureMask & 0xe0) == 0xe0) { // AVX512 is supported now - detectedPlatform = Arch::AVX512_ARCH; - } - } - } - - return detectedPlatform; - } -} // namespace orc - -#endif diff --git a/c++/src/Dispatch.hh b/c++/src/Dispatch.hh new file mode 100644 index 0000000000..4185ad9b48 --- /dev/null +++ b/c++/src/Dispatch.hh @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_DISPATCH_HH +#define ORC_DISPATCH_HH + +#include +#include + +#include "CpuInfoUtil.hh" + +namespace orc { + enum class DispatchLevel : int { + // These dispatch levels, corresponding to instruction set features, + // are sorted in increasing order of preference. + NONE = 0, + AVX512, + MAX + }; + + /* + A facility for dynamic dispatch according to available DispatchLevel. + + Typical use: + + static void my_function_default(...); + static void my_function_avx512(...); + + struct MyDynamicFunction { + using FunctionType = decltype(&my_function_default); + + static std::vector> implementations() { + return { + { DispatchLevel::NONE, my_function_default } + #if defined(ORC_HAVE_RUNTIME_AVX512) + , { DispatchLevel::AVX512, my_function_avx512 } + #endif + }; + } + }; + + void my_function(...) 
{ + static DynamicDispatch dispatch; + return dispatch.func(...); + } + */ + template + class DynamicDispatch { + protected: + using FunctionType = typename DynamicFunction::FunctionType; + using Implementation = std::pair; + + public: + DynamicDispatch() { + Resolve(DynamicFunction::implementations()); + } + + FunctionType func = {}; + + protected: + // Use the Implementation with the highest DispatchLevel + void Resolve(const std::vector& implementations) { + Implementation cur{DispatchLevel::NONE, {}}; + + for (const auto& impl : implementations) { + if (impl.first >= cur.first && IsSupported(impl.first)) { + // Higher (or same) level than current + cur = impl; + } + } + + if (!cur.second) { + throw InvalidArgument("No appropriate implementation found"); + } + func = cur.second; + } + + private: + bool IsSupported(DispatchLevel level) const { + static const auto cpu_info = orc::CpuInfo::GetInstance(); + + switch (level) { + case DispatchLevel::NONE: + return true; + case DispatchLevel::AVX512: + return cpu_info->IsSupported(CpuInfo::AVX512); + default: + return false; + } + } + }; +} // namespace orc + +#endif diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index 20ee402346..6dac427c9e 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -22,12 +22,15 @@ #include "Adaptor.hh" #include "RLE.hh" #include "orc/Exceptions.hh" +// #include "BpackingDefault.hh" +// #include "Bpacking.hh" #include #define MAX_VECTOR_BUF_8BIT_LENGTH 64 #define MAX_VECTOR_BUF_16BIT_LENGTH 32 #define MAX_VECTOR_BUF_32BIT_LENGTH 16 + #define MAX_LITERAL_SIZE 512 #define MIN_REPEAT 3 #define HIST_LEN 32 @@ -169,6 +172,24 @@ namespace orc { void next(int16_t* data, uint64_t numValues, const char* notNull) override; + // + unsigned char readByte(char** bufStart, char** bufEnd); + void resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, + uint32_t backupLen); + + char* getBufferStart() { + return bufferStart; + } + + char* getBufferEnd() { + return bufferEnd; + } 
+ char* bufferStart; + char* bufferEnd; + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs + // + private: /** * Decode the next gap and patch from 'unpackedPatch' and update the index on it. @@ -192,46 +213,16 @@ namespace orc { resetReadLongs(); } - void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); - unsigned char readByte(); + // void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); + // unsigned char readByte(); int64_t readLongBE(uint64_t bsz); int64_t readVslong(); uint64_t readVulong(); - void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + int readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); -#if defined(ORC_HAVE_RUNTIME_AVX512) - void unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len); - void 
unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len); -#endif - void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); @@ -259,12 +250,13 @@ namespace orc { unsigned char firstByte; uint64_t runLength; // Length of the current run uint64_t runRead; // Number of returned values of the current run - const char* bufferStart; - const char* bufferEnd; - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs + // char* bufferStart; + // char* bufferEnd; + // uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + // uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run + #if defined(ORC_HAVE_RUNTIME_AVX512) uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data diff --git 
a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 30b36e8c48..c8c2082e61 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -18,15 +18,22 @@ #include "Adaptor.hh" #include "Compression.hh" -#include "DetectPlatform.hh" +// #include "DetectPlatform.hh" +#include "Bpacking.hh" +#include "Dispatch.hh" #include "RLEV2Util.hh" #include "RLEv2.hh" #include "Utils.hh" -#include "VectorDecoder.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#include "BpackingAvx512.hh" +// #include "BitUnpackerAvx512.hh" +#endif namespace orc { - void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { - uint64_t restLen = bufferEnd - bufferStart; + + void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, + uint32_t backupByteLen) { + uint64_t remainingLen = *bufEnd - *bufStart; int bufferLength = 0; const void* bufferPointer = nullptr; @@ -34,33 +41,33 @@ namespace orc { inputStream->BackUp(backupByteLen); } - if (len >= restLen && resetBuf == true) { + if (len >= remainingLen && resetBuf == true) { if (!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::resetBufferStart"); } } if (bufferPointer == nullptr) { - bufferStart += len; + *bufStart += len; } else { - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + *bufStart = (char*)bufferPointer; + *bufEnd = *bufStart + bufferLength; } } - unsigned char RleDecoderV2::readByte() { + unsigned char RleDecoderV2::readByte(char** bufStart, char** bufEnd) { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); - if (bufferStart == bufferEnd) { + if (*bufStart == *bufEnd) { int bufferLength; const void* bufferPointer; if (!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::readByte"); } - bufferStart = static_cast(bufferPointer); - bufferEnd = bufferStart + bufferLength; + *bufStart = (char*)bufferPointer; + *bufEnd = *bufStart + 
bufferLength; } - unsigned char result = static_cast(*bufferStart++); + unsigned char result = static_cast(*(*bufStart)++); return result; } @@ -69,7 +76,7 @@ namespace orc { uint64_t n = bsz; while (n > 0) { n--; - val = readByte(); + val = readByte(&bufferStart, &bufferEnd); ret |= (val << (n * 8)); } return ret; @@ -83,4643 +90,45 @@ namespace orc { uint64_t ret = 0, b; uint64_t offset = 0; do { - b = readByte(); + b = readByte(&bufferStart, &bufferEnd); ret |= (0x7f & b) << offset; offset += 7; } while (b >= 0x80); return ret; } - void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { - uint64_t startBit = 0; -#if defined(ORC_HAVE_RUNTIME_AVX512) - const auto runtimeEnable = getenv("ENABLE_RUNTIME_AVX512"); - std::string avxRuntimeEnable = runtimeEnable == nullptr ? "OFF" : std::string(runtimeEnable); - if (detectPlatform() == Arch::AVX512_ARCH && strcasecmp(avxRuntimeEnable.c_str(), "on") == 0) { - switch (fbs) { - case 1: - unrolledUnpackVector1(data, offset, len); - return; - case 2: - unrolledUnpackVector2(data, offset, len); - return; - case 3: - unrolledUnpackVector3(data, offset, len); - return; - case 4: - unrolledUnpackVector4(data, offset, len); - return; - case 5: - unrolledUnpackVector5(data, offset, len); - return; - case 6: - unrolledUnpackVector6(data, offset, len); - return; - case 7: - unrolledUnpackVector7(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 9: - unrolledUnpackVector9(data, offset, len); - return; - case 10: - unrolledUnpackVector10(data, offset, len); - return; - case 11: - unrolledUnpackVector11(data, offset, len); - return; - case 12: - unrolledUnpackVector12(data, offset, len); - return; - case 13: - unrolledUnpackVector13(data, offset, len); - return; - case 14: - unrolledUnpackVector14(data, offset, len); - return; - case 15: - unrolledUnpackVector15(data, offset, len); - return; - case 16: - unrolledUnpackVector16(data, offset, len); - return; 
- case 17: - unrolledUnpackVector17(data, offset, len); - return; - case 18: - unrolledUnpackVector18(data, offset, len); - return; - case 19: - unrolledUnpackVector19(data, offset, len); - return; - case 20: - unrolledUnpackVector20(data, offset, len); - return; - case 21: - unrolledUnpackVector21(data, offset, len); - return; - case 22: - unrolledUnpackVector22(data, offset, len); - return; - case 23: - unrolledUnpackVector23(data, offset, len); - return; - case 24: - unrolledUnpackVector24(data, offset, len); - return; - case 26: - unrolledUnpackVector26(data, offset, len); - return; - case 28: - unrolledUnpackVector28(data, offset, len); - return; - case 30: - unrolledUnpackVector30(data, offset, len); - return; - case 32: - unrolledUnpackVector32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs, startBit); - return; - } - } else { - switch (fbs) { - case 4: - unrolledUnpack4(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 16: - unrolledUnpack16(data, offset, len); - return; - case 24: - unrolledUnpack24(data, offset, len); - return; - case 32: - unrolledUnpack32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. 
- plainUnpackLongs(data, offset, len, fbs, startBit); - return; - } - } -#else - switch (fbs) { - case 4: - unrolledUnpack4(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 16: - unrolledUnpack16(data, offset, len); - return; - case 24: - unrolledUnpack24(data, offset, len); - return; - case 32: - unrolledUnpack32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs, startBit); - return; - } -#endif - } + /////// + struct UnpackDynamicFunction { + using FunctionType = decltype(&readLongsDefault); + static std::vector> implementations() { + return {{DispatchLevel::NONE, readLongsDefault} #if defined(ORC_HAVE_RUNTIME_AVX512) - void RleDecoderV2::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 1; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * 
ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); - while (numElements >= 64) { - uint64_t src_64 = *(uint64_t*)srcPtr; - // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF - __m512i srcmm = _mm512_movm_epi8(src_64); - // make 0x00 --> 0x00, 0xFF --> 0x01 - srcmm = _mm512_abs_epi8(srcmm); - srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); - _mm512_storeu_si512(vectorBuf8, srcmm); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 2; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } 
else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) - __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 - while (numElements >= 64) { - __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); - __m512i srcmm0, srcmm1, srcmm2, tmpmm; - - srcmm2 = _mm512_srli_epi16(srcmm3, 2); - srcmm1 = _mm512_srli_epi16(srcmm3, 4); - srcmm0 = _mm512_srli_epi16(srcmm3, 6); - - // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. 
- // move them into their places - // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 - // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 - srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn - - // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 - // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 - srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 - srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op - - tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl - srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop - srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop - - srcmm0 = _mm512_and_si512(srcmm0, parse_mask); - - _mm512_storeu_si512(vectorBuf8, srcmm0); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, 
bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 3; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint32_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = 
bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += 
numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 4; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 
8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) - __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 - while (numElements >= 64) { - __m512i srcmm0, srcmm1, tmpmm; - - srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm0 = _mm512_srli_epi16(srcmm1, 4); - - // move elements into their places - // srcmm0: a c e g 0 0 0 0 - // srcmm1: b d f h 0 0 0 0 - tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 - srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 - srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh - srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh - - // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
- srcmm0 = _mm512_and_si512(srcmm0, parseMask); - - _mm512_storeu_si512(vectorBuf8, srcmm0); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 5; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - 
resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, 
shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); + , + {DispatchLevel::AVX512, readLongsAvx512} +#endif + }; } - } - - void RleDecoderV2::unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 6; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t 
startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable6u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); 
- - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } + }; - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector7(int64_t* 
data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 7; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 64) { - __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); - __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i permutexIdx = 
_mm512_load_si512(permutexIdxTable7u); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask); - - _mm512_storeu_si512(vectorBuf8, zmm[0]); - - srcPtr += 8 * bitWidth; - resetBufferStart(8 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - 
plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 9; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = 
reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = 
_mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 7); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - 
resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 10; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - 
moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); - __m512i shiftMask = _mm512_load_si512(shiftTable10u); - - while (numElements >= 32) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi16(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, 
startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 11; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = 
bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); 
- dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 5); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - 
ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 12; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - 
tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); - __m512i shiftMask = _mm512_load_si512(shiftTable12u); - - while (numElements >= 32) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi16(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, 
startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 13; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if 
(startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - 
zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 3); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - - 
_mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 14; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } 
else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); - - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); - - __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so 
they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 15; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if 
(startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 32) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); - __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); - permutexIdxPtr[1] = 
_mm512_load_si512(permutexIdxTable15u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); - - while (numElements >= 64) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - if (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); - zmm[1] 
= _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi16(zmm[0], 1); - - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - - _mm512_storeu_si512(vectorBuf16, zmm[0]); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - 
- bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 16; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = len; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - int64_t* dstPtr = data + offset; - bool resetBuf = false; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 32) { - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); - while (numElements >= 32) { - __m512i srcmm = _mm512_loadu_si512(srcPtr); - srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); - _mm512_storeu_si512(vectorBuf16, srcmm); - - srcPtr += 4 * bitWidth; - resetBufferStart(4 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack16(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - ; - 
unrolledUnpack16(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 17; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - 
bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = 
_mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 15); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if 
(backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 18; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); 
- plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = 
_mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 14); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - 
bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 19; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = 
numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = 
_mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 13); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - 
srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 20; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 
0u) { - uint32_t align = getAlign(startBit, bitWidth, 32u); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16u) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); - __m512i shiftMask = _mm512_load_si512(shiftTable20u); - - while (numElements >= 16u) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi32(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, 
backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 21; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0u) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - 
ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - 
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 11); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd 
- bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 22; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align 
= numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable22u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - 
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 10); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, 
bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 23; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; 
- } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = 
_mm512_maskz_loadu_epi16(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 9); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= 
moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 24; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - - __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); - - while (numElements >= 16) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = 
_mm512_permutexvar_epi32(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack24(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - ; - unrolledUnpack24(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 26; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); - - __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); - - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); - - // shifting elements 
so they start from the start of the word - zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 6); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = _mm512_srli_epi16(zmm[0], 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], 
reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 28; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len 
-= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); - __m512i shiftMask = _mm512_load_si512(shiftTable28u); - - while (numElements >= 16) { - __m512i srcmm, zmm; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); - zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); - - // shifting elements so they start from the start of the word - zmm = _mm512_srlv_epi32(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - 
numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 30; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t startBit = 0; - uint64_t tailBitLen = 0; - uint32_t backupByteLen = 0; - - while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - 
bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - bufRestByteLen = bufferEnd - bufferStart; - dstPtr += align; - numElements -= align; - } - } - - if (numElements >= 16) { - __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - __m512i maskmm = _mm512_set1_epi8(0x0F); - - __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); - - __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); - - __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable30u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); - - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); - - while (numElements >= 32) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); - - // shuffling so in zmm[0] 
will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); - zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - if (numElements >= 16) { - __m512i srcmm, zmm[2]; - - srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); - - __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); - __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); - - srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); - - // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones - zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); - zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); - - // shifting elements so they start from the start of the word - zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); - zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); - - // gathering even and odd elements together - zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); - zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - - zmm[0] = _mm512_slli_epi32(zmm[0], 2u); - lowNibblemm = _mm512_and_si512(zmm[0], maskmm); - highNibblemm = 
_mm512_srli_epi16(zmm[0], 4u); - highNibblemm = _mm512_and_si512(highNibblemm, maskmm); - - lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); - highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); - lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); - - zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf32, zmm[0]); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } - - void RleDecoderV2::unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len) { - uint32_t bitWidth = 32; - const uint8_t* srcPtr = reinterpret_cast(bufferStart); - uint64_t numElements = 0; - int64_t* dstPtr = data + offset; - uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = bufferEnd - bufferStart; - bool resetBuf = false; - uint64_t tailBitLen = 0; - 
uint32_t backupByteLen = 0; - - while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (numElements >= 16) { - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); - while (numElements >= 16) { - __m512i srcmm = _mm512_loadu_si512(srcPtr); - srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); - _mm512_storeu_si512(vectorBuf32, srcmm); - - srcPtr += 2 * bitWidth; - resetBufferStart(2 * bitWidth, false, 0); - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; - } - } - - if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - unrolledUnpack32(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(bufferStart); - dstPtr += numElements; - bufRestByteLen = bufferEnd - bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - ; - unrolledUnpack32(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } else { - resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - } - - bufRestByteLen = bufferEnd - bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(bufferStart); - } - } -#endif - - void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
- while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; - } - if (curIdx == offset + len) return; - - // Exhaust the buffer - uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - uint32_t localByte; - for (uint64_t i = 0; i < numGroups; ++i) { - localByte = *buffer++; - data[curIdx] = (localByte >> 4) & 15; - data[curIdx + 1] = localByte & 15; - curIdx += 2; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = readByte(); - bitsLeft = 8; - } - } - - void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = bufferEnd - bufferStart; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - data[curIdx++] = *buffer++; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = readByte(); - } - } - - void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 2; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint16_t b0, b1; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - buffer += 2; - data[curIdx++] = (b0 << 8) | b1; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - data[curIdx++] = (b0 << 8) | b1; - } - } - - void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 3; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - buffer += 3; - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - bufferStart += bufferNum * 3; - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - } - - void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 4; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2, b3; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - buffer += 4; - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - } - - void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 5; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - buffer += 5; - data[curIdx++] = - static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - } - - void RleDecoderV2::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 6; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - buffer += 6; - data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | - (b4 << 8) | b5); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - data[curIdx++] = - static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); - } - } - - void RleDecoderV2::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 7; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - buffer += 7; - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - } - - void RleDecoderV2::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 8; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - b7 = static_cast(*(buffer + 7)); - buffer += 8; - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - b7 = readByte(); - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - } - - void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, - uint64_t& startBit) { - for (uint64_t i = offset; i < (offset + len); i++) { - uint64_t result = 0; - uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast(result); - startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); - } + int RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + static DynamicDispatch dispatch; + return dispatch.func(this, data, offset, len, fbs); } + /////// RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, MemoryPool& pool, ReaderMetrics* _metrics) : RleDecoder(_metrics), + bufferStart(nullptr), + bufferEnd(bufferStart), + bitsLeft(0), + curByte(0), inputStream(std::move(input)), isSigned(_isSigned), firstByte(0), runLength(0), runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - bitsLeft(0), - curByte(0), unpackedPatch(pool, 0), literals(pool, MAX_LITERAL_SIZE) { // PASS @@ -4763,7 +172,7 @@ namespace orc { if (runRead == runLength) { resetRun(); - firstByte = readByte(); + firstByte = readByte(&bufferStart, &bufferEnd); } uint64_t offset = nRead, length = numValues - nRead; @@ -4850,7 +259,7 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength |= 
readByte(&bufferStart, &bufferEnd); // runs are one off runLength += 1; runRead = 0; @@ -4899,13 +308,13 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength |= readByte(&bufferStart, &bufferEnd); // runs are one off runLength += 1; runRead = 0; // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); + uint64_t thirdByte = readByte(&bufferStart, &bufferEnd); uint64_t byteSize = (thirdByte >> 5) & 0x07; // base width is one off byteSize += 1; @@ -4915,7 +324,7 @@ namespace orc { uint32_t patchBitSize = decodeBitWidth(pwo); // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); + uint64_t fourthByte = readByte(&bufferStart, &bufferEnd); uint32_t pgw = (fourthByte >> 5) & 0x07; // patch gap width is one off pgw += 1; @@ -5003,7 +412,7 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(); + runLength |= readByte(&bufferStart, &bufferEnd); ++runLength; // account for first value runRead = 0; diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 6eb8a82121..bd10440e6f 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -36,7 +36,7 @@ namespace orc { const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M - class RleVectorTest : public TestWithParam { + class RleV2BitUnpackAvx512Test : public TestWithParam { virtual void SetUp(); protected: @@ -71,7 +71,7 @@ namespace orc { delete[] decodedData; } - void RleVectorTest::SetUp() { + void RleV2BitUnpackAvx512Test::SetUp() { alignBitpacking = GetParam(); } @@ -113,9 +113,9 @@ namespace orc { fflush(stdout); } - std::unique_ptr RleVectorTest::getEncoder(RleVersion version, - MemoryOutputStream& memStream, - bool isSigned) { + std::unique_ptr RleV2BitUnpackAvx512Test::getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned) { MemoryPool* 
pool = getDefaultPool(); return createRleEncoder(std::unique_ptr(new BufferedOutputStream( @@ -123,9 +123,9 @@ namespace orc { isSigned, version, *pool, alignBitpacking); } - void RleVectorTest::runTest(RleVersion version, uint64_t numValues, int64_t start, int64_t delta, - bool random, bool isSigned, uint8_t bitWidth, uint64_t blockSize, - uint64_t numNulls) { + void RleV2BitUnpackAvx512Test::runTest(RleVersion version, uint64_t numValues, int64_t start, + int64_t delta, bool random, bool isSigned, + uint8_t bitWidth, uint64_t blockSize, uint64_t numNulls) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); std::unique_ptr encoder = getEncoder(version, memStream, isSigned); @@ -142,7 +142,7 @@ namespace orc { } #if defined(ORC_HAVE_RUNTIME_AVX512) - TEST_P(RleVectorTest, RleV2_basic_vector_decode_1bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_1bit) { uint8_t bitWidth = 1; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -159,7 +159,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_2bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_2bit) { uint8_t bitWidth = 2; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -176,7 +176,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_3bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_3bit) { uint8_t bitWidth = 3; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -193,7 +193,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_4bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_4bit) { uint8_t bitWidth = 4; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -210,7 +210,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_5bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_5bit) { uint8_t bitWidth = 5; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -227,7 +227,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_6bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_6bit) { uint8_t bitWidth = 6; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -244,7 +244,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_7bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_7bit) { uint8_t bitWidth = 7; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -261,7 +261,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_9bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_9bit) { uint8_t bitWidth = 9; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { @@ -279,7 +279,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_10bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_10bit) { uint8_t bitWidth = 10; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -296,7 +296,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_11bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_11bit) { uint8_t bitWidth = 11; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, 
bitWidth, blockSize); @@ -313,7 +313,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_12bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_12bit) { uint8_t bitWidth = 12; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -330,7 +330,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_13bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_13bit) { uint8_t bitWidth = 13; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -347,7 +347,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_14bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_14bit) { uint8_t bitWidth = 14; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -364,7 +364,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_15bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_15bit) { uint8_t bitWidth = 15; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -381,7 +381,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_16bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_16bit) { uint8_t bitWidth = 16; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -398,7 +398,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_17bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_17bit) { uint8_t bitWidth = 17; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -415,7 +415,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_18bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_18bit) { uint8_t bitWidth = 18; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -432,7 +432,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_19bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_19bit) { uint8_t bitWidth = 19; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -449,7 +449,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_20bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_20bit) { uint8_t bitWidth = 20; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -466,7 +466,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_21bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_21bit) { uint8_t bitWidth = 21; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -483,7 +483,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_22bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_22bit) { uint8_t bitWidth = 22; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -500,7 +500,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_23bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_23bit) { uint8_t bitWidth = 23; runTest(RleVersion_2, 3277, 0, 0, 
true, false, bitWidth, 108); for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { @@ -518,7 +518,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_24bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_24bit) { uint8_t bitWidth = 24; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -535,7 +535,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_26bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_26bit) { uint8_t bitWidth = 26; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -552,7 +552,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_28bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_28bit) { uint8_t bitWidth = 28; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -569,7 +569,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_30bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_30bit) { uint8_t bitWidth = 30; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -586,7 +586,7 @@ namespace orc { printf("\n"); } - TEST_P(RleVectorTest, RleV2_basic_vector_decode_32bit) { + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_32bit) { uint8_t bitWidth = 32; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); @@ -603,6 +603,6 @@ namespace orc { printf("\n"); } - INSTANTIATE_TEST_SUITE_P(OrcTest, RleVectorTest, Values(true, false)); + INSTANTIATE_TEST_SUITE_P(OrcTest, RleV2BitUnpackAvx512Test, Values(true, false)); #endif 
} // namespace orc From 6bc9035eec65e69672f24bb55dbd5b48dbc6a10b Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Feb 2023 18:14:06 -0500 Subject: [PATCH 22/80] Delete some comments in code. --- CMakeLists.txt | 79 ++++---- c++/src/BpackingAvx512.cc | 22 +-- c++/src/BpackingAvx512.hh | 17 +- c++/src/BpackingDefault.cc | 28 +-- c++/src/BpackingDefault.hh | 15 -- c++/src/CMakeLists.txt | 374 +++++++++++++++++++------------------ c++/src/RLEv2.hh | 31 --- c++/src/RleDecoderV2.cc | 4 - 8 files changed, 260 insertions(+), 310 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ac39ddb83..fa988e136f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,14 +1,14 @@ -#Licensed under the Apache License, Version 2.0(the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -#http: // www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
cmake_minimum_required (VERSION 3.12.0) if (POLICY CMP0048) @@ -20,13 +20,12 @@ endif () project(ORC C CXX) -#Version number of package +# Version number of package SET(CPACK_PACKAGE_VERSION_MAJOR "1") SET(CPACK_PACKAGE_VERSION_MINOR "9") SET(CPACK_PACKAGE_VERSION_PATCH "0-SNAPSHOT") SET(ORC_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") -set(CMAKE_MODULE_PATH ${ - CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake_modules") option (BUILD_JAVA "Include ORC Java library in the build process" @@ -72,7 +71,7 @@ option(BUILD_CPP_AVX512 "Enable build with AVX512 at compile time" ON) -#Make sure that a build type is selected +# Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") set (CMAKE_BUILD_TYPE "RELWITHDEBINFO") @@ -80,7 +79,7 @@ else () message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") endif () -#Set the package format +# Set the package format SET(CPACK_GENERATOR "TGZ") SET(CPACK_PACKAGE_VENDOR "Apache ORC") SET(CPACK_PACKAGE_CONTACT "Apache ORC ") @@ -104,15 +103,15 @@ if(NOT DEFINED ORC_RUNTIME_SIMD_LEVEL) endif() # -#Compiler specific flags +# Compiler specific flags # -#This ensures that things like c++ 17 get passed correctly +# This ensures that things like c++17 get passed correctly if(NOT DEFINED CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) elseif(${CMAKE_CXX_STANDARD} VERSION_LESS 17) message(FATAL_ERROR "Cannot set a CMAKE_CXX_STANDARD smaller than 17") endif() -#We require a C++ 17 compliant compiler +# We require a C++17 compliant compiler set(CMAKE_CXX_STANDARD_REQUIRED ON) if (NOT MSVC) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-omit-frame-pointer") @@ -195,9 +194,9 @@ if(NOT DEFINED ORC_CPU_FLAG) endif() endif() -#Check architecture specific compiler flags +# Check architecture specific compiler flags if(ORC_CPU_FLAG STREQUAL 
"x86") -#x86 / amd64 compiler flags, msvc / gcc / clang + # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_SSE4_2_FLAG "") set(ORC_AVX2_FLAG "/arch:AVX2") @@ -206,7 +205,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") else() set(ORC_SSE4_2_FLAG "-msse4.2") set(ORC_AVX2_FLAG "-march=haswell") -#skylake - avx512 consists of AVX512F, AVX512BW, AVX512VL, AVX512CD, AVX512DQ + # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ORC_AVX512_FLAG "-march=native -mbmi2") set(ORC_AVX2_FLAG "${ORC_AVX2_FLAG} -mavx2") set(ORC_AVX512_FLAG @@ -214,29 +213,29 @@ if(ORC_CPU_FLAG STREQUAL "x86") endif() check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) if(MINGW) -#https: // gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 message(STATUS "Disable AVX512 support on MINGW for now") else() -#Check for AVX512 support in the compiler. + # Check for AVX512 support in the compiler. set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") check_cxx_source_compiles(" -#ifdef _MSC_VER -#include -#else -#include -#endif + #ifdef _MSC_VER + #include + #else + #include + #endif int main() { - __m512i mask = _mm512_set1_epi32(0x1); - char out[32]; - _mm512_storeu_si512(out, mask); - return 0; + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; }" CXX_SUPPORTS_AVX512) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() -#Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL + # Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL if(CXX_SUPPORTS_SSE4_2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(SSE4_2|AVX2|AVX512|MAX)$") set(ORC_HAVE_RUNTIME_SSE4_2 ON) @@ -260,14 +259,14 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "NONE") endif() elseif(ORC_CPU_FLAG STREQUAL "ppc") -#power compiler flags, gcc / clang only + # power compiler flags, gcc/clang only 
set(ORC_ALTIVEC_FLAG "-maltivec") check_cxx_compiler_flag(${ORC_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC) if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") set(ORC_SIMD_LEVEL "NONE") endif() elseif(ORC_CPU_FLAG STREQUAL "aarch64") -#Arm64 compiler flags, gcc / clang only + # Arm64 compiler flags, gcc/clang only set(ORC_ARMV8_MARCH "armv8-a") check_cxx_compiler_flag("-march=${ORC_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") @@ -275,10 +274,10 @@ elseif(ORC_CPU_FLAG STREQUAL "aarch64") endif() endif() -#Only enable additional instruction sets if they are supported +# Only enable additional instruction sets if they are supported if(ORC_CPU_FLAG STREQUAL "x86") if(MINGW) -#Enable _xgetbv() intrinsic to query OS support for ZMM register saves + # Enable _xgetbv() intrinsic to query OS support for ZMM register saves set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") endif() if(ORC_SIMD_LEVEL STREQUAL "AVX512") @@ -335,17 +334,15 @@ if (BUILD_TOOLS) endif () if (BUILD_CPP_TESTS) -#Add another target called test - out that prints the results on failure + # Add another target called test-out that prints the results on failure if (CMAKE_CONFIGURATION_TYPES) add_custom_target (test-out - COMMAND ${ - CMAKE_CTEST_COMMAND} --force-new-ctest-process + COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure --build-config "$" ) else () add_custom_target (test-out - COMMAND ${ - CMAKE_CTEST_COMMAND} --force-new-ctest-process + COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure ) endif () diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index fbbda92c55..5faab97a26 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -25,9 +25,7 @@ namespace orc { #if defined(ORC_HAVE_RUNTIME_AVX512) UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), - unpackDefault(UnpackDefault(dec)), - bitsLeft(decoder->bitsLeft), - curByte(decoder->curByte) { + unpackDefault(UnpackDefault(dec)) { // PASS } 
@@ -4298,22 +4296,22 @@ namespace orc { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - bitsLeft = 8; + while (bitsLeftToRead > decoder->bitsLeft) { + result <<= decoder->bitsLeft; + result |= decoder->curByte & ((1 << decoder->bitsLeft) - 1); + bitsLeftToRead -= decoder->bitsLeft; + decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + decoder->bitsLeft = 8; } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + decoder->bitsLeft -= static_cast(bitsLeftToRead); + result |= (decoder->curByte >> decoder->bitsLeft) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); - startBit = bitsLeft == 0 ? 0 : (8 - bitsLeft); + startBit = decoder->bitsLeft == 0 ? 
0 : (8 - decoder->bitsLeft); } } #endif diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index e177210388..77daa9eecf 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -75,17 +75,12 @@ namespace orc { RleDecoderV2* decoder; UnpackDefault unpackDefault; - // char* bufferStart; - // char* bufferEnd; - uint32_t bitsLeft; - uint32_t curByte; - - uint8_t - vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data - uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + - 1]; // Used by vectorially 9~16 bit-unpacking data - uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + - 1]; // Used by vectorially 17~32 bit-unpacking data + // Used by vectorially 1~8 bit-unpacking data + uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; + // Used by vectorially 9~16 bit-unpacking data + uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; + // Used by vectorially 17~32 bit-unpacking data + uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; }; #endif diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 821188b10f..4d3ddea1ed 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -22,7 +22,7 @@ namespace orc { UnpackDefault::UnpackDefault(RleDecoderV2* dec) - : decoder(dec), bitsLeft(decoder->bitsLeft), curByte(decoder->curByte) { + : decoder(dec) { // PASS } @@ -34,9 +34,9 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
- while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; + while (decoder->bitsLeft > 0 && curIdx < offset + len) { + decoder->bitsLeft -= 4; + data[curIdx++] = (decoder->curByte >> decoder->bitsLeft) & 15; } if (curIdx == offset + len) return; @@ -57,8 +57,8 @@ namespace orc { if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - bitsLeft = 8; + decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + decoder->bitsLeft = 8; } } @@ -316,19 +316,19 @@ namespace orc { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - bitsLeft = 8; + while (bitsLeftToRead > decoder->bitsLeft) { + result <<= decoder->bitsLeft; + result |= decoder->curByte & ((1 << decoder->bitsLeft) - 1); + bitsLeftToRead -= decoder->bitsLeft; + decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + decoder->bitsLeft = 8; } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + decoder->bitsLeft -= static_cast(bitsLeftToRead); + result |= (decoder->curByte >> decoder->bitsLeft) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); } diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index c3d37c4ee8..89dcc24b1b 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -22,7 +22,6 @@ #include #include -// #include "Adaptor.hh" #include "RLEv2.hh" #include "io/InputStream.hh" #include "io/OutputStream.hh" @@ -46,22 +45,8 @@ namespace orc 
{ void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - /* void setBuf(char* bufStart, char* bufEnd) { - bufferStart = bufStart; - bufferEnd = bufEnd; - } - - void getBuf(char** bufStart, char** bufEnd) { - *bufStart = bufferStart; - *bufEnd = bufferEnd; - }*/ - private: RleDecoderV2* decoder; - // char* bufferStart; - // char* bufferEnd; - uint32_t bitsLeft; - uint32_t curByte; }; } // namespace orc diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 6aa6c1f1cc..5dc217c384 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -1,103 +1,110 @@ -#Licensed under the Apache License, Version 2.0(the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -#http: // www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") - INCLUDE(CheckCXXSourceCompiles) +INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" -#include -#include - int main(int, char*[]) { +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int,char*[]){ int f = open(\"/x/y\", O_RDONLY); char buf[100]; return pread(f, buf, 100, 1000) == 0; - } " - HAS_PREAD) - - CHECK_CXX_SOURCE_COMPILES(" -#include - int main(int, char*[]) { - struct tm time2020; + }" + HAS_PREAD +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int,char*[]){ + struct tm time2020; return !strptime(\"2020-02-02 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2020); - } " - HAS_STRPTIME) - - CHECK_CXX_SOURCE_COMPILES(" - int main() { - int a; - return __builtin_add_overflow(1, 2, &a); - } " - HAS_BUILTIN_OVERFLOW_CHECK) - - CHECK_CXX_SOURCE_COMPILES(" -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored \"-Wdeprecated\" -#pragma clang diagnostic pop -#elif defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored \"-Wdeprecated\" -#pragma GCC diagnostic pop -#elif defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4996) -#pragma warning(pop) -#else + }" + HAS_STRPTIME +) + +CHECK_CXX_SOURCE_COMPILES(" + int main(){ + int a; + return __builtin_add_overflow(1, 2, &a); + }" + HAS_BUILTIN_OVERFLOW_CHECK +) + +CHECK_CXX_SOURCE_COMPILES(" + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored \"-Wdeprecated\" + #pragma clang diagnostic pop + #elif defined(__GNUC__) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored \"-Wdeprecated\" + #pragma GCC diagnostic pop + #elif defined(_MSC_VER) + #pragma warning( push ) + #pragma warning( disable : 4996 ) + #pragma warning( pop ) + #else unknownCompiler! 
-#endif - int main(int, char*[]) {} " - HAS_DIAGNOSTIC_PUSH) - - CHECK_CXX_SOURCE_COMPILES(" -#include - int main(int, char*[]) { - return std::isnan(1.0f); - } " - HAS_STD_ISNAN) - - CHECK_CXX_SOURCE_COMPILES(" -#include - int main(int, char*[]) { - double d = 5; - std::to_string(d); - } " - HAS_DOUBLE_TO_STRING) - - CHECK_CXX_SOURCE_COMPILES(" -#include -#include - int main(int, char*[]) { - int64_t d = 5; - std::to_string(d); - } " - HAS_INT64_TO_STRING) - - INCLUDE(CheckCXXSourceRuns) - - CHECK_CXX_SOURCE_RUNS( - " -#include - int main(int, char*[]) { - time_t t = -14210715; // 1969-07-20 12:34:45 - struct tm* ptm = gmtime(&t); - return !(ptm && ptm->tm_year == 69); - } " - HAS_PRE_1970) - - CHECK_CXX_SOURCE_RUNS(" -#include -#include - int main(int, char*[]) { + #endif + int main(int, char *[]) {}" + HAS_DIAGNOSTIC_PUSH +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + return std::isnan(1.0f); + }" + HAS_STD_ISNAN +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + int main(int, char *[]) { + double d = 5; + std::to_string(d); + }" + HAS_DOUBLE_TO_STRING +) + +CHECK_CXX_SOURCE_COMPILES(" + #include + #include + int main(int, char *[]) { + int64_t d = 5; + std::to_string(d); + }" + HAS_INT64_TO_STRING +) + +INCLUDE(CheckCXXSourceRuns) + +CHECK_CXX_SOURCE_RUNS(" + #include + int main(int, char *[]) { + time_t t = -14210715; // 1969-07-20 12:34:45 + struct tm *ptm = gmtime(&t); + return !(ptm && ptm->tm_year == 69); + }" + HAS_PRE_1970 +) + +CHECK_CXX_SOURCE_RUNS(" + #include + #include + int main(int, char *[]) { setenv(\"TZ\", \"America/Los_Angeles\", 1); tzset(); struct tm time2037; @@ -105,95 +112,98 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") strptime(\"2037-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2037); strptime(\"2038-05-05 12:34:56\", \"%Y-%m-%d %H:%M:%S\", &time2038); return (mktime(&time2038) - mktime(&time2037)) <= 31500000; - } " - HAS_POST_2038) - - set(CMAKE_REQUIRED_INCLUDES 
${ZLIB_INCLUDE_DIR}) set( - CMAKE_REQUIRED_LIBRARIES - orc_zlib) CHECK_CXX_SOURCE_COMPILES(" -#define Z_PREFIX -#include - z_stream - strm; - int main( - int, - char*[]) { - deflateReset( - &strm); - } " - NEEDS_Z_PREFIX) - - configure_file( - "Adaptor.hh.in" - "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh") - - include_directories(${ - CMAKE_CURRENT_SOURCE_DIR} ${ - CMAKE_CURRENT_BINARY_DIR} ${ - LIBHDFSPP_INCLUDE_DIR}) - - add_custom_command( - OUTPUT orc_proto.pb.h orc_proto.pb - .cc COMMAND ${ - PROTOBUF_EXECUTABLE} - - I ${PROJECT_SOURCE_DIR} / - proto-- cpp_out = - "${CMAKE_CURRENT_BINARY_DIR}" - "${PROJECT_SOURCE_DIR}/proto/" - "orc_proto.proto") - - set(SOURCE_FILES - "${CMAKE_CURRENT_BINARY_" - "DIR}/Adaptor.hh" orc_proto - .pb.h io / - InputStream.cc io / - OutputStream.cc sargs / - ExpressionTree.cc sargs / - Literal.cc sargs / - PredicateLeaf.cc sargs / - SargsApplier.cc sargs / - SearchArgument.cc sargs / - TruthValue.cc wrap / orc - - proto - - wrapper.cc Adaptor - .cc BlockBuffer - .cc BloomFilter.cc ByteRLE - .cc ColumnPrinter - .cc ColumnReader - .cc ColumnWriter.cc Common - .cc Compression - .cc Exceptions.cc Int128 - .cc LzoDecompressor - .cc MemoryPool.cc Murmur3 - .cc OrcFile.cc Reader - .cc RLEv1.cc RLEV2Util - .cc RleDecoderV2 - .cc RleEncoderV2.cc RLE - .cc Statistics - .cc StripeStream.cc Timezone - .cc TypeImpl.cc Vector - .cc Writer.cc CpuInfoUtil - .cc BpackingDefault - .cc BpackingAvx512 - .cc Bpacking.cc) - - if (BUILD_LIBHDFSPP) set( - SOURCE_FILES ${ - SOURCE_FILES} OrcHdfsFile - .cc) add_definitions(-DBUILD_LIBHDFSPP) - endif(BUILD_LIBHDFSPP) - - add_library( - orc STATIC ${ - SOURCE_FILES}) - - target_link_libraries( - orc orc::protobuf - orc::zlib orc::snappy - orc::lz4 orc::zstd - ${LIBHDFSPP_LIBRARIES}) - - install( - TARGETS orc - DESTINATION - lib) + }" + HAS_POST_2038 +) + +set(CMAKE_REQUIRED_INCLUDES ${ZLIB_INCLUDE_DIR}) +set(CMAKE_REQUIRED_LIBRARIES orc_zlib) +CHECK_CXX_SOURCE_COMPILES(" + #define Z_PREFIX + #include + z_stream 
strm; + int main(int, char *[]) { + deflateReset(&strm); + }" + NEEDS_Z_PREFIX +) + +configure_file ( + "Adaptor.hh.in" + "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" + ) + +include_directories ( + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${LIBHDFSPP_INCLUDE_DIR} + ) + +add_custom_command(OUTPUT orc_proto.pb.h orc_proto.pb.cc + COMMAND ${PROTOBUF_EXECUTABLE} + -I ${PROJECT_SOURCE_DIR}/proto + --cpp_out="${CMAKE_CURRENT_BINARY_DIR}" + "${PROJECT_SOURCE_DIR}/proto/orc_proto.proto" +) + +set(SOURCE_FILES + "${CMAKE_CURRENT_BINARY_DIR}/Adaptor.hh" + orc_proto.pb.h + io/InputStream.cc + io/OutputStream.cc + sargs/ExpressionTree.cc + sargs/Literal.cc + sargs/PredicateLeaf.cc + sargs/SargsApplier.cc + sargs/SearchArgument.cc + sargs/TruthValue.cc + wrap/orc-proto-wrapper.cc + Adaptor.cc + BlockBuffer.cc + BloomFilter.cc + ByteRLE.cc + ColumnPrinter.cc + ColumnReader.cc + ColumnWriter.cc + Common.cc + Compression.cc + Exceptions.cc + Int128.cc + LzoDecompressor.cc + MemoryPool.cc + Murmur3.cc + OrcFile.cc + Reader.cc + RLEv1.cc + RLEV2Util.cc + RleDecoderV2.cc + RleEncoderV2.cc + RLE.cc + Statistics.cc + StripeStream.cc + Timezone.cc + TypeImpl.cc + Vector.cc + Writer.cc + CpuInfoUtil.cc + BpackingDefault.cc + BpackingAvx512.cc + Bpacking.cc) + +if(BUILD_LIBHDFSPP) + set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc) + add_definitions(-DBUILD_LIBHDFSPP) +endif(BUILD_LIBHDFSPP) + +add_library (orc STATIC ${SOURCE_FILES}) + +target_link_libraries (orc + orc::protobuf + orc::zlib + orc::snappy + orc::lz4 + orc::zstd + ${LIBHDFSPP_LIBRARIES} + ) + +install(TARGETS orc DESTINATION lib) diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index 6dac427c9e..ba5dd0d81a 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -22,15 +22,9 @@ #include "Adaptor.hh" #include "RLE.hh" #include "orc/Exceptions.hh" -// #include "BpackingDefault.hh" -// #include "Bpacking.hh" #include -#define MAX_VECTOR_BUF_8BIT_LENGTH 64 -#define MAX_VECTOR_BUF_16BIT_LENGTH 32 -#define 
MAX_VECTOR_BUF_32BIT_LENGTH 16 - #define MAX_LITERAL_SIZE 512 #define MIN_REPEAT 3 #define HIST_LEN 32 @@ -172,23 +166,14 @@ namespace orc { void next(int16_t* data, uint64_t numValues, const char* notNull) override; - // unsigned char readByte(char** bufStart, char** bufEnd); void resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, uint32_t backupLen); - char* getBufferStart() { - return bufferStart; - } - - char* getBufferEnd() { - return bufferEnd; - } char* bufferStart; char* bufferEnd; uint32_t bitsLeft; // Used by readLongs when bitSize < 8 uint32_t curByte; // Used by anything that uses readLongs - // private: /** @@ -213,9 +198,6 @@ namespace orc { resetReadLongs(); } - // void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); - // unsigned char readByte(); - int64_t readLongBE(uint64_t bsz); int64_t readVslong(); uint64_t readVulong(); @@ -250,21 +232,8 @@ namespace orc { unsigned char firstByte; uint64_t runLength; // Length of the current run uint64_t runRead; // Number of returned values of the current run - // char* bufferStart; - // char* bufferEnd; - // uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - // uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run - -#if defined(ORC_HAVE_RUNTIME_AVX512) - uint8_t - vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; // Used by vectorially 1~8 bit-unpacking data - uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + - 1]; // Used by vectorially 9~16 bit-unpacking data - uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + - 1]; // Used by vectorially 17~32 bit-unpacking data -#endif }; } // namespace orc diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index c8c2082e61..a1ea19619e 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -18,7 +18,6 @@ #include "Adaptor.hh" #include "Compression.hh" -// #include "DetectPlatform.hh" #include 
"Bpacking.hh" #include "Dispatch.hh" #include "RLEV2Util.hh" @@ -26,7 +25,6 @@ #include "Utils.hh" #if defined(ORC_HAVE_RUNTIME_AVX512) #include "BpackingAvx512.hh" -// #include "BitUnpackerAvx512.hh" #endif namespace orc { @@ -97,7 +95,6 @@ namespace orc { return ret; } - /////// struct UnpackDynamicFunction { using FunctionType = decltype(&readLongsDefault); @@ -115,7 +112,6 @@ namespace orc { static DynamicDispatch dispatch; return dispatch.func(this, data, offset, len, fbs); } - /////// RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, MemoryPool& pool, ReaderMetrics* _metrics) From ca3af78cbd734a9dcdd576e58b6fc3d04f5bc343 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Feb 2023 22:53:56 -0500 Subject: [PATCH 23/80] Fix some comments. --- CMakeLists.txt | 53 +++++------------------------ c++/src/Bpacking.cc | 54 ++++++++++++++--------------- c++/src/BpackingAvx512.cc | 58 +++++++++++++++----------------- c++/src/BpackingAvx512.hh | 54 ++++++++++++++--------------- c++/src/BpackingDefault.cc | 40 +++++++++------------- c++/src/CpuInfoUtil.cc | 44 ++---------------------- c++/src/RLEv2.hh | 17 ++-------- c++/src/RleDecoderV2.cc | 2 +- c++/test/TestRleVectorDecoder.cc | 5 +-- 9 files changed, 112 insertions(+), 215 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa988e136f..fcb1c2a90d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ option(BUILD_CPP_ENABLE_METRICS "Enable the metrics collection at compile phase" OFF) -option(BUILD_CPP_AVX512 +option(BUILD_ENABLE_AVX512 "Enable build with AVX512 at compile time" ON) @@ -183,12 +183,6 @@ if(NOT DEFINED ORC_CPU_FLAG) set(ORC_CPU_FLAG "aarch64") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") set(ORC_CPU_FLAG "aarch32") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "powerpc|ppc") - set(ORC_CPU_FLAG "ppc") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") - set(ORC_CPU_FLAG "s390x") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") - set(ORC_CPU_FLAG 
"riscv64") else() message(FATAL_ERROR "Unknown system processor") endif() @@ -198,16 +192,11 @@ endif() if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) - set(ORC_SSE4_2_FLAG "") - set(ORC_AVX2_FLAG "/arch:AVX2") set(ORC_AVX512_FLAG "/arch:AVX512") set(CXX_SUPPORTS_SSE4_2 TRUE) else() - set(ORC_SSE4_2_FLAG "-msse4.2") - set(ORC_AVX2_FLAG "-march=haswell") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ORC_AVX512_FLAG "-march=native -mbmi2") - set(ORC_AVX2_FLAG "${ORC_AVX2_FLAG} -mavx2") set(ORC_AVX512_FLAG "${ORC_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") endif() @@ -235,36 +224,22 @@ if(ORC_CPU_FLAG STREQUAL "x86") CXX_SUPPORTS_AVX512) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() + + message(STATUS "BUILD_ENABLE_AVX512=${BUILD_ENABLE_AVX512}") + message(STATUS "CXX_SUPPORTS_AVX512=${CXX_SUPPORTS_AVX512}") + message(STATUS "ORC_RUNTIME_SIMD_LEVEL=${ORC_RUNTIME_SIMD_LEVEL}") # Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL - if(CXX_SUPPORTS_SSE4_2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES - "^(SSE4_2|AVX2|AVX512|MAX)$") - set(ORC_HAVE_RUNTIME_SSE4_2 ON) - set(ORC_SIMD_LEVEL "SSE4_2") - add_definitions(-DORC_HAVE_RUNTIME_SSE4_2) - endif() - if(CXX_SUPPORTS_AVX2 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX2|AVX512|MAX)$") - set(ORC_HAVE_RUNTIME_AVX2 ON) - set(ORC_SIMD_LEVEL "AVX2") - add_definitions(-DORC_HAVE_RUNTIME_AVX2 -DORC_HAVE_RUNTIME_BMI2) - endif() if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") message(STATUS "Enable the AVX512 vector decode of bit-packing") set(ORC_HAVE_RUNTIME_AVX512 ON) set(ORC_SIMD_LEVEL "AVX512") - add_definitions(-DORC_HAVE_RUNTIME_AVX512 -DORC_HAVE_RUNTIME_BMI2) + add_definitions(-DORC_HAVE_RUNTIME_AVX512) else () message(STATUS "Disable the AVX512 vector decode of bit-packing") endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") set(ORC_SIMD_LEVEL "NONE") 
endif() -elseif(ORC_CPU_FLAG STREQUAL "ppc") - # power compiler flags, gcc/clang only - set(ORC_ALTIVEC_FLAG "-maltivec") - check_cxx_compiler_flag(${ORC_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC) - if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") - set(ORC_SIMD_LEVEL "NONE") - endif() elseif(ORC_CPU_FLAG STREQUAL "aarch64") # Arm64 compiler flags, gcc/clang only set(ORC_ARMV8_MARCH "armv8-a") @@ -273,6 +248,8 @@ elseif(ORC_CPU_FLAG STREQUAL "aarch64") set(ORC_SIMD_LEVEL "NEON") endif() endif() +message(STATUS "ORC_HAVE_RUNTIME_AVX512=${ORC_HAVE_RUNTIME_AVX512}") +message(STATUS "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL}") # Only enable additional instruction sets if they are supported if(ORC_CPU_FLAG STREQUAL "x86") @@ -285,20 +262,6 @@ if(ORC_CPU_FLAG STREQUAL "x86") message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") - add_definitions(-DORC_HAVE_AVX512 -DORC_HAVE_AVX2 -DORC_HAVE_BMI2 - -DORC_HAVE_SSE4_2) - elseif(ORC_SIMD_LEVEL STREQUAL "AVX2") - if(NOT CXX_SUPPORTS_AVX2) - message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX2_FLAG}") - add_definitions(-DORC_HAVE_AVX2 -DORC_HAVE_BMI2 -DORC_HAVE_SSE4_2) - elseif(ORC_SIMD_LEVEL STREQUAL "SSE4_2") - if(NOT CXX_SUPPORTS_SSE4_2) - message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_SSE4_2_FLAG}") - add_definitions(-DORC_HAVE_SSE4_2) elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") message(WARNING "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") endif() diff --git a/c++/src/Bpacking.cc b/c++/src/Bpacking.cc index 39993d2d52..5c37b6f195 100644 --- a/c++/src/Bpacking.cc +++ b/c++/src/Bpacking.cc @@ -70,88 +70,88 @@ namespace orc { if (cpu_info->IsSupported(CpuInfo::AVX512)) { switch (fbs) { case 1: - unpackAvx512.unrolledUnpackVector1(data, offset, len); + unpackAvx512.vectorUnpack1(data, offset, len); 
break; case 2: - unpackAvx512.unrolledUnpackVector2(data, offset, len); + unpackAvx512.vectorUnpack2(data, offset, len); break; case 3: - unpackAvx512.unrolledUnpackVector3(data, offset, len); + unpackAvx512.vectorUnpack3(data, offset, len); break; case 4: - unpackAvx512.unrolledUnpackVector4(data, offset, len); + unpackAvx512.vectorUnpack4(data, offset, len); break; case 5: - unpackAvx512.unrolledUnpackVector5(data, offset, len); + unpackAvx512.vectorUnpack5(data, offset, len); break; case 6: - unpackAvx512.unrolledUnpackVector6(data, offset, len); + unpackAvx512.vectorUnpack6(data, offset, len); break; case 7: - unpackAvx512.unrolledUnpackVector7(data, offset, len); + unpackAvx512.vectorUnpack7(data, offset, len); break; case 8: unpackDefault.unrolledUnpack8(data, offset, len); break; case 9: - unpackAvx512.unrolledUnpackVector9(data, offset, len); + unpackAvx512.vectorUnpack9(data, offset, len); break; case 10: - unpackAvx512.unrolledUnpackVector10(data, offset, len); + unpackAvx512.vectorUnpack10(data, offset, len); break; case 11: - unpackAvx512.unrolledUnpackVector11(data, offset, len); + unpackAvx512.vectorUnpack11(data, offset, len); break; case 12: - unpackAvx512.unrolledUnpackVector12(data, offset, len); + unpackAvx512.vectorUnpack12(data, offset, len); break; case 13: - unpackAvx512.unrolledUnpackVector13(data, offset, len); + unpackAvx512.vectorUnpack13(data, offset, len); break; case 14: - unpackAvx512.unrolledUnpackVector14(data, offset, len); + unpackAvx512.vectorUnpack14(data, offset, len); break; case 15: - unpackAvx512.unrolledUnpackVector15(data, offset, len); + unpackAvx512.vectorUnpack15(data, offset, len); break; case 16: - unpackAvx512.unrolledUnpackVector16(data, offset, len); + unpackAvx512.vectorUnpack16(data, offset, len); break; case 17: - unpackAvx512.unrolledUnpackVector17(data, offset, len); + unpackAvx512.vectorUnpack17(data, offset, len); break; case 18: - unpackAvx512.unrolledUnpackVector18(data, offset, len); + 
unpackAvx512.vectorUnpack18(data, offset, len); break; case 19: - unpackAvx512.unrolledUnpackVector19(data, offset, len); + unpackAvx512.vectorUnpack19(data, offset, len); break; case 20: - unpackAvx512.unrolledUnpackVector20(data, offset, len); + unpackAvx512.vectorUnpack20(data, offset, len); break; case 21: - unpackAvx512.unrolledUnpackVector21(data, offset, len); + unpackAvx512.vectorUnpack21(data, offset, len); break; case 22: - unpackAvx512.unrolledUnpackVector22(data, offset, len); + unpackAvx512.vectorUnpack22(data, offset, len); break; case 23: - unpackAvx512.unrolledUnpackVector23(data, offset, len); + unpackAvx512.vectorUnpack23(data, offset, len); break; case 24: - unpackAvx512.unrolledUnpackVector24(data, offset, len); + unpackAvx512.vectorUnpack24(data, offset, len); break; case 26: - unpackAvx512.unrolledUnpackVector26(data, offset, len); + unpackAvx512.vectorUnpack26(data, offset, len); break; case 28: - unpackAvx512.unrolledUnpackVector28(data, offset, len); + unpackAvx512.vectorUnpack28(data, offset, len); break; case 30: - unpackAvx512.unrolledUnpackVector30(data, offset, len); + unpackAvx512.vectorUnpack30(data, offset, len); break; case 32: - unpackAvx512.unrolledUnpackVector32(data, offset, len); + unpackAvx512.vectorUnpack32(data, offset, len); break; case 40: unpackDefault.unrolledUnpack40(data, offset, len); diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 5faab97a26..412c9341d5 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -23,9 +23,7 @@ namespace orc { #if defined(ORC_HAVE_RUNTIME_AVX512) - UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) - : decoder(dec), - unpackDefault(UnpackDefault(dec)) { + UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) { // PASS } @@ -33,7 +31,7 @@ namespace orc { // PASS } - void UnpackAvx512::unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack1(int64_t* data, 
uint64_t offset, uint64_t len) { uint32_t bitWidth = 1; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint32_t numElements = 0; @@ -154,7 +152,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 2; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint32_t numElements = 0; @@ -298,7 +296,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 3; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint32_t numElements = 0; @@ -442,7 +440,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 4; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -574,7 +572,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 5; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -718,7 +716,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 6; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -862,7 +860,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len) { 
uint32_t bitWidth = 7; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1006,7 +1004,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 9; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1200,7 +1198,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 10; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1332,7 +1330,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 11; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1535,7 +1533,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 12; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1667,7 +1665,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 13; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -1870,7 +1868,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector14(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len) { uint32_t 
bitWidth = 14; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -2014,7 +2012,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 15; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -2217,7 +2215,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 16; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = len; @@ -2297,7 +2295,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 17; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -2491,7 +2489,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 18; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -2685,7 +2683,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 19; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -2879,7 +2877,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len) { uint32_t 
bitWidth = 20; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3011,7 +3009,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 21; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3205,7 +3203,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 22; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3399,7 +3397,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 23; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3594,7 +3592,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 24; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3683,7 +3681,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 26; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -3877,7 +3875,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len) { uint32_t 
bitWidth = 28; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -4009,7 +4007,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 30; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; @@ -4211,7 +4209,7 @@ namespace orc { } } - void UnpackAvx512::unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len) { + void UnpackAvx512::vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 32; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); uint64_t numElements = 0; diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 77daa9eecf..a885b2b640 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -40,33 +40,33 @@ namespace orc { UnpackAvx512(RleDecoderV2* dec); ~UnpackAvx512(); - void unrolledUnpackVector1(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector2(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector3(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector4(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector5(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector6(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector7(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector9(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector10(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector11(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector12(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector13(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector14(int64_t* data, uint64_t offset, 
uint64_t len); - void unrolledUnpackVector15(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector16(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector17(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector18(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector19(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector20(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector21(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector22(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector23(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector24(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector26(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector28(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector30(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpackVector32(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len); + void 
vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len); void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 4d3ddea1ed..50e3e2520f 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -21,8 +21,7 @@ namespace orc { - UnpackDefault::UnpackDefault(RleDecoderV2* dec) - : decoder(dec) { + UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder(dec) { // PASS } @@ -44,7 +43,7 @@ namespace orc { uint64_t numGroups = (offset + len - curIdx) / 2; numGroups = std::min(numGroups, static_cast(decoder->bufferEnd - decoder->bufferStart)); - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. 
const auto* buffer = reinterpret_cast(decoder->bufferStart); uint32_t localByte; for (uint64_t i = 0; i < numGroups; ++i) { @@ -88,7 +87,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 2; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint16_t b0, b1; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -99,8 +98,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); data[curIdx++] = (b0 << 8) | b1; @@ -114,7 +112,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 3; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -126,8 +124,7 @@ namespace orc { decoder->bufferStart += bufferNum * 3; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); @@ -142,7 +139,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 4; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2, b3; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -155,8 +152,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); @@ -172,7 +168,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 5; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -187,8 +183,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); @@ -205,7 +200,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 6; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -221,8 +216,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); @@ -241,7 +235,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 7; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -258,8 +252,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); @@ -279,7 +272,7 @@ namespace orc { int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 8; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'decoder->bufferStart' inside the loop. + // Avoid updating 'bufferStart' inside the loop. const auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); @@ -297,8 +290,7 @@ namespace orc { decoder->bufferStart = (char*)buffer; if (curIdx == offset + len) return; - // One of the following readByte() will update 'decoder->bufferStart' and - // 'decoder->bufferEnd'. + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 02e7a7cb60..e8ea88cb35 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -426,9 +426,6 @@ namespace orc { bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { enum { USER_SIMD_NONE, - USER_SIMD_SSE4_2, - USER_SIMD_AVX, - USER_SIMD_AVX2, USER_SIMD_AVX512, USER_SIMD_MAX, }; @@ -437,12 +434,6 @@ namespace orc { // Parse the level if (simd_level == "AVX512") { level = USER_SIMD_AVX512; - } else if (simd_level == "AVX2") { - level = USER_SIMD_AVX2; - } else if (simd_level == "AVX") { - level = USER_SIMD_AVX; - } else if (simd_level == "SSE4_2") { - level = USER_SIMD_SSE4_2; } else if (simd_level == "NONE") { level = USER_SIMD_NONE; } else { @@ -453,20 +444,11 @@ namespace orc { if (level < USER_SIMD_AVX512) { 
*hardware_flags &= ~CpuInfo::AVX512; } - if (level < USER_SIMD_AVX2) { - *hardware_flags &= ~(CpuInfo::AVX2 | CpuInfo::BMI2); - } - if (level < USER_SIMD_AVX) { - *hardware_flags &= ~CpuInfo::AVX; - } - if (level < USER_SIMD_SSE4_2) { - *hardware_flags &= ~(CpuInfo::SSE4_2 | CpuInfo::BMI1); - } return true; } void ArchVerifyCpuRequirements(const CpuInfo* ci) { -#if defined(ORC_HAVE_AVX512) +#if defined(ORC_HAVE_RUNTIME_AVX512) if (!ci->IsDetected(CpuInfo::AVX512)) { throw ParseError("CPU does not support the Supplemental AVX512 instruction set"); } @@ -489,15 +471,7 @@ namespace orc { } } -#else - //------------------------------ PPC, ... ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - return true; - } - - void ArchVerifyCpuRequirements(const CpuInfo* ci) {} - -#endif // X86, ARM, PPC +#endif // X86, ARM } // namespace @@ -524,16 +498,6 @@ namespace orc { throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); } } - - // void EnableFeature(int64_t flag, bool enable) { - // if (!enable) { - // hardware_flags &= ~flag; - // } else { - // // Can't turn something on that can't be supported - // DCHECK_EQ((~original_hardware_flags) & flag, 0); - // hardware_flags |= (flag & original_hardware_flags); - // } - // } }; CpuInfo::~CpuInfo() = default; @@ -589,10 +553,6 @@ namespace orc { return ArchVerifyCpuRequirements(this); } - // void CpuInfo::EnableFeature(int64_t flag, bool enable) { - // impl_->EnableFeature(flag, enable); - // } - } // namespace orc #undef CPUINFO_ARCH_X86 diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index ba5dd0d81a..c56ef7cfa3 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -202,18 +202,6 @@ namespace orc { int64_t readVslong(); uint64_t readVulong(); int readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, - uint64_t& startBit); - - void 
unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); template uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); @@ -228,10 +216,9 @@ namespace orc { const std::unique_ptr inputStream; const bool isSigned; - unsigned char firstByte; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run + uint64_t runLength; // Length of the current run + uint64_t runRead; // Number of returned values of the current run DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run }; diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index a1ea19619e..362ff599e2 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -17,8 +17,8 @@ */ #include "Adaptor.hh" -#include "Compression.hh" #include "Bpacking.hh" +#include "Compression.hh" #include "Dispatch.hh" #include "RLEV2Util.hh" #include "RLEv2.hh" diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index bd10440e6f..3ee923fd74 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -16,8 +16,6 @@ * limitations under the License. 
*/ -#include - #include #include "MemoryOutputStream.hh" @@ -108,8 +106,7 @@ namespace orc { int32_t lpad = offset * BARWIDTH / total; int32_t rpad = BARWIDTH - lpad; - printf("\r%s:%3d%% [%.*s%*s] [%" PRId64 "/%" PRId64 "]", testName, val, lpad, BARSTR, rpad, "", - offset, total); + printf("\r%s:%3d%% [%.*s%*s] [%ld /%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); fflush(stdout); } From eeafccfe6973f8c3d1bef6e65b0ef468c2071a68 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 13 Feb 2023 23:33:24 -0500 Subject: [PATCH 24/80] Fix some comments --- CMakeLists.txt | 1 + c++/src/Bpacking.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fcb1c2a90d..daa38b2d45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,6 +235,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) else () + set(ORC_HAVE_RUNTIME_AVX512 OFF) message(STATUS "Disable the AVX512 vector decode of bit-packing") endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") diff --git a/c++/src/Bpacking.cc b/c++/src/Bpacking.cc index 5c37b6f195..ca2c703ae5 100644 --- a/c++/src/Bpacking.cc +++ b/c++/src/Bpacking.cc @@ -60,7 +60,6 @@ namespace orc { } #if defined(ORC_HAVE_RUNTIME_AVX512) - // template int readLongsAvx512(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { UnpackAvx512 unpackAvx512(decoder); From d9c562b76edcbf077022543114cda228957d70fb Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 15 Feb 2023 22:00:05 -0500 Subject: [PATCH 25/80] 1.Modified the CMakelists, delete the part of aarch64 and ORC_RUNTIME_SIMD_LEVEL, also changed the message content 2.Modified the print content and style about BUILD_ENABLE_AVX512, CXX_SUPPORTS_AVX512, ORC_HAVE_RUNTIME_AVX512 and ORC_SIMD_LEVEL Delete the print of CXX_SUPPORTS_AVX512 3.Separate the configuration of AVX512 from CMakelists, and create a new cmake module "cmake_modules/ConfigSimdLevel.cmake" file 
4.Modified the style of code comments --- CMakeLists.txt | 108 +--------------------------- c++/src/BitUnpackerAvx512.hh | 10 +-- c++/src/Bpacking.cc | 8 ++- c++/src/Bpacking.hh | 5 +- c++/src/BpackingAvx512.cc | 7 +- c++/src/BpackingAvx512.hh | 5 +- c++/src/CpuInfoUtil.cc | 15 ++-- c++/src/CpuInfoUtil.hh | 63 ++++++++-------- c++/src/Dispatch.hh | 56 +++++++-------- cmake_modules/ConfigSimdLevel.cmake | 95 ++++++++++++++++++++++++ 10 files changed, 183 insertions(+), 189 deletions(-) create mode 100644 cmake_modules/ConfigSimdLevel.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index daa38b2d45..fb759d1a51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,17 +91,6 @@ if (BUILD_POSITION_INDEPENDENT_LIB) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif () -if(NOT DEFINED ORC_SIMD_LEVEL) - set(ORC_SIMD_LEVEL - "DEFAULT" - CACHE STRING "Compile time SIMD optimization level") -endif() -if(NOT DEFINED ORC_RUNTIME_SIMD_LEVEL) - set(ORC_RUNTIME_SIMD_LEVEL - "MAX" - CACHE STRING "Max runtime SIMD optimization level") -endif() - # # Compiler specific flags # @@ -172,102 +161,6 @@ elseif (MSVC) set (WARN_FLAGS "${WARN_FLAGS} -wd4146") # unary minus operator applied to unsigned type, result still unsigned endif () -include(CheckCXXCompilerFlag) -include(CheckCXXSourceCompiles) -message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") - -if(NOT DEFINED ORC_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64") - set(ORC_CPU_FLAG "x86") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") - set(ORC_CPU_FLAG "aarch64") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") - set(ORC_CPU_FLAG "aarch32") - else() - message(FATAL_ERROR "Unknown system processor") - endif() -endif() - -# Check architecture specific compiler flags -if(ORC_CPU_FLAG STREQUAL "x86") - # x86/amd64 compiler flags, msvc/gcc/clang - if(MSVC) - set(ORC_AVX512_FLAG "/arch:AVX512") - set(CXX_SUPPORTS_SSE4_2 TRUE) - else() - # skylake-avx512 consists of 
AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ - set(ORC_AVX512_FLAG "-march=native -mbmi2") - set(ORC_AVX512_FLAG - "${ORC_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") - endif() - check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) - if(MINGW) - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 - message(STATUS "Disable AVX512 support on MINGW for now") - else() - # Check for AVX512 support in the compiler. - set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") - check_cxx_source_compiles(" - #ifdef _MSC_VER - #include - #else - #include - #endif - - int main() { - __m512i mask = _mm512_set1_epi32(0x1); - char out[32]; - _mm512_storeu_si512(out, mask); - return 0; - }" - CXX_SUPPORTS_AVX512) - set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) - endif() - - message(STATUS "BUILD_ENABLE_AVX512=${BUILD_ENABLE_AVX512}") - message(STATUS "CXX_SUPPORTS_AVX512=${CXX_SUPPORTS_AVX512}") - message(STATUS "ORC_RUNTIME_SIMD_LEVEL=${ORC_RUNTIME_SIMD_LEVEL}") - # Runtime SIMD level it can get from compiler and ORC_RUNTIME_SIMD_LEVEL - if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND ORC_RUNTIME_SIMD_LEVEL MATCHES "^(AVX512|MAX)$") - message(STATUS "Enable the AVX512 vector decode of bit-packing") - set(ORC_HAVE_RUNTIME_AVX512 ON) - set(ORC_SIMD_LEVEL "AVX512") - add_definitions(-DORC_HAVE_RUNTIME_AVX512) - else () - set(ORC_HAVE_RUNTIME_AVX512 OFF) - message(STATUS "Disable the AVX512 vector decode of bit-packing") - endif() - if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") - set(ORC_SIMD_LEVEL "NONE") - endif() -elseif(ORC_CPU_FLAG STREQUAL "aarch64") - # Arm64 compiler flags, gcc/clang only - set(ORC_ARMV8_MARCH "armv8-a") - check_cxx_compiler_flag("-march=${ORC_ARMV8_MARCH}+sve" CXX_SUPPORTS_SVE) - if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") - set(ORC_SIMD_LEVEL "NEON") - endif() -endif() -message(STATUS "ORC_HAVE_RUNTIME_AVX512=${ORC_HAVE_RUNTIME_AVX512}") 
-message(STATUS "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL}") - -# Only enable additional instruction sets if they are supported -if(ORC_CPU_FLAG STREQUAL "x86") - if(MINGW) - # Enable _xgetbv() intrinsic to query OS support for ZMM register saves - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") - endif() - if(ORC_SIMD_LEVEL STREQUAL "AVX512") - if(NOT CXX_SUPPORTS_AVX512) - message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") - elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") - message(WARNING "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") - endif() -endif() - if (BUILD_CPP_ENABLE_METRICS) message(STATUS "Enable the metrics collection") add_compile_definitions(ENABLE_METRICS=1) @@ -280,6 +173,7 @@ enable_testing() INCLUDE(CheckSourceCompiles) INCLUDE(ThirdpartyToolchain) +INCLUDE(ConfigSimdLevel) set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 688acc728e..55b662271c 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -16,8 +16,10 @@ * limitations under the License. 
*/ -#ifndef VECTOR_DECODER_HH -#define VECTOR_DECODER_HH +#ifndef BIT_UNPACKER_AVX512_HH +#define BIT_UNPACKER_AVX512_HH + +#if defined(ORC_HAVE_RUNTIME_AVX512) // Mingw-w64 defines strcasecmp in string.h #if defined(_WIN32) && !defined(strcasecmp) @@ -27,7 +29,6 @@ #include #endif -#if defined(ORC_HAVE_RUNTIME_AVX512) #include #include @@ -484,5 +485,6 @@ namespace orc { return result; } } // namespace orc -#endif + +#endif // #if defined(ORC_HAVE_RUNTIME_AVX512) #endif diff --git a/c++/src/Bpacking.cc b/c++/src/Bpacking.cc index ca2c703ae5..64f11f013f 100644 --- a/c++/src/Bpacking.cc +++ b/c++/src/Bpacking.cc @@ -17,7 +17,11 @@ */ #include "Bpacking.hh" +#include "BpackingDefault.hh" #include "CpuInfoUtil.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#include "BpackingAvx512.hh" +#endif namespace orc { int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, @@ -65,8 +69,8 @@ namespace orc { UnpackAvx512 unpackAvx512(decoder); UnpackDefault unpackDefault(decoder); uint64_t startBit = 0; - static const auto cpu_info = orc::CpuInfo::GetInstance(); - if (cpu_info->IsSupported(CpuInfo::AVX512)) { + static const auto cpu_info = CpuInfo::getInstance(); + if (cpu_info->isSupported(CpuInfo::AVX512)) { switch (fbs) { case 1: unpackAvx512.vectorUnpack1(data, offset, len); diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 0366d747b5..4aa1e1fc88 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -21,10 +21,7 @@ #include -#include "BpackingDefault.hh" -#if defined(ORC_HAVE_RUNTIME_AVX512) -#include "BpackingAvx512.hh" -#endif +#include "RLEv2.hh" namespace orc { int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 412c9341d5..9646e665ee 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -16,13 +16,13 @@ * limitations under the License. 
*/ +#if defined(ORC_HAVE_RUNTIME_AVX512) + #include "BpackingAvx512.hh" #include "BitUnpackerAvx512.hh" #include "Utils.hh" namespace orc { - -#if defined(ORC_HAVE_RUNTIME_AVX512) UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) { // PASS } @@ -4312,6 +4312,7 @@ namespace orc { startBit = decoder->bitsLeft == 0 ? 0 : (8 - decoder->bitsLeft); } } -#endif } // namespace orc + +#endif // #if defined(ORC_HAVE_RUNTIME_AVX512) diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index a885b2b640..1ed3d5eca4 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -19,6 +19,8 @@ #ifndef ORC_BPACKINGAVX512_HH #define ORC_BPACKINGAVX512_HH +#if defined(ORC_HAVE_RUNTIME_AVX512) + #include #include @@ -34,7 +36,6 @@ namespace orc { #define MAX_VECTOR_BUF_16BIT_LENGTH 32 #define MAX_VECTOR_BUF_32BIT_LENGTH 16 -#if defined(ORC_HAVE_RUNTIME_AVX512) class UnpackAvx512 { public: UnpackAvx512(RleDecoderV2* dec); @@ -82,8 +83,8 @@ namespace orc { // Used by vectorially 17~32 bit-unpacking data uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; }; -#endif } // namespace orc +#endif // #if defined(ORC_HAVE_RUNTIME_AVX512) #endif diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index e8ea88cb35..1519f04bcc 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -27,6 +27,7 @@ #endif #ifdef _WIN32 +#define NOMINMAX #include #include #endif @@ -449,7 +450,7 @@ namespace orc { void ArchVerifyCpuRequirements(const CpuInfo* ci) { #if defined(ORC_HAVE_RUNTIME_AVX512) - if (!ci->IsDetected(CpuInfo::AVX512)) { + if (!ci->isDetected(CpuInfo::AVX512)) { throw ParseError("CPU does not support the Supplemental AVX512 instruction set"); } #endif @@ -466,7 +467,7 @@ namespace orc { } void ArchVerifyCpuRequirements(const CpuInfo* ci) { - if (!ci->IsDetected(CpuInfo::ASIMD)) { + if (!ci->isDetected(CpuInfo::ASIMD)) { throw ParseError("CPU does not support the Armv8 Neon instruction set"); } } @@ 
-504,7 +505,7 @@ namespace orc { CpuInfo::CpuInfo() : impl_(new Impl) {} - const CpuInfo* CpuInfo::GetInstance() { + const CpuInfo* CpuInfo::getInstance() { static CpuInfo cpu_info; return &cpu_info; } @@ -525,7 +526,7 @@ namespace orc { return impl_->model_name; } - int64_t CpuInfo::CacheSize(CacheLevel level) const { + int64_t CpuInfo::cacheSize(CacheLevel level) const { constexpr int64_t kDefaultCacheSizes[] = { 32 * 1024, // Level 1: 32K 256 * 1024, // Level 2: 256K @@ -541,15 +542,15 @@ namespace orc { return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); } - bool CpuInfo::IsSupported(int64_t flags) const { + bool CpuInfo::isSupported(int64_t flags) const { return (impl_->hardware_flags & flags) == flags; } - bool CpuInfo::IsDetected(int64_t flags) const { + bool CpuInfo::isDetected(int64_t flags) const { return (impl_->original_hardware_flags & flags) == flags; } - void CpuInfo::VerifyCpuRequirements() const { + void CpuInfo::verifyCpuRequirements() const { return ArchVerifyCpuRequirements(this); } diff --git a/c++/src/CpuInfoUtil.hh b/c++/src/CpuInfoUtil.hh index dfc3a75843..d2c2a8d7b1 100644 --- a/c++/src/CpuInfoUtil.hh +++ b/c++/src/CpuInfoUtil.hh @@ -25,15 +25,17 @@ namespace orc { - /// CpuInfo is an interface to query for cpu information at runtime. The caller can - /// ask for the sizes of the caches and what hardware features are supported. - /// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and - /// /sys/devices) + /** + * CpuInfo is an interface to query for cpu information at runtime. The caller can + * ask for the sizes of the caches and what hardware features are supported. 
+ * On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and + * /sys/devices) + */ class CpuInfo { public: ~CpuInfo(); - /// x86 features + // x86 features static constexpr int64_t SSSE3 = (1LL << 0); static constexpr int64_t SSE4_1 = (1LL << 1); static constexpr int64_t SSE4_2 = (1LL << 2); @@ -49,53 +51,50 @@ namespace orc { static constexpr int64_t BMI1 = (1LL << 11); static constexpr int64_t BMI2 = (1LL << 12); - /// Arm features + // Arm features static constexpr int64_t ASIMD = (1LL << 32); - /// Cache enums for L1 (data), L2 and L3 + // Cache enums for L1 (data), L2 and L3 enum class CacheLevel { L1 = 0, L2, L3, Last = L3 }; - /// CPU vendors + // CPU vendors enum class Vendor { Unknown, Intel, AMD }; - static const CpuInfo* GetInstance(); + static const CpuInfo* getInstance(); - /// Returns all the flags for this cpu + // Returns all the flags for this cpu int64_t hardwareFlags() const; - /// Returns the number of cores (including hyper-threaded) on this machine. + // Returns the number of cores (including hyper-threaded) on this machine. int numCores() const; - /// Returns the vendor of the cpu. + // Returns the vendor of the cpu. Vendor vendor() const; - /// Returns the model name of the cpu (e.g. Intel i7-2600) + // Returns the model name of the cpu (e.g. Intel i7-2600) const std::string& modelName() const; - /// Returns the size of the cache in KB at this cache level - int64_t CacheSize(CacheLevel level) const; + // Returns the size of the cache in KB at this cache level + int64_t cacheSize(CacheLevel level) const; - /// \brief Returns whether or not the given feature is enabled. - /// - /// IsSupported() is true if IsDetected() is also true and the feature - /// wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL - /// environment variable). - bool IsSupported(int64_t flags) const; + /** + * Returns whether or not the given feature is enabled. 
+ * isSupported() is true if isDetected() is also true and the feature + * wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL + * environment variable). + */ + bool isSupported(int64_t flags) const; - /// Returns whether or not the given feature is available on the CPU. - bool IsDetected(int64_t flags) const; + // Returns whether or not the given feature is available on the CPU. + bool isDetected(int64_t flags) const; - /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error - /// and terminate. - void VerifyCpuRequirements() const; + // Determine if the CPU meets the minimum CPU requirements and if not, issue an error + // and terminate. + void verifyCpuRequirements() const; - /// Toggle a hardware feature on and off. It is not valid to turn on a feature - /// that the underlying hardware cannot support. This is useful for testing. - // void EnableFeature(int64_t flag, bool enable); - - bool HasEfficientBmi2() const { + bool hasEfficientBmi2() const { // BMI2 (pext, pdep) is only efficient on Intel X86 processors. - return vendor() == Vendor::Intel && IsSupported(BMI2); + return vendor() == Vendor::Intel && isSupported(BMI2); } private: diff --git a/c++/src/Dispatch.hh b/c++/src/Dispatch.hh index 4185ad9b48..7f4382278f 100644 --- a/c++/src/Dispatch.hh +++ b/c++/src/Dispatch.hh @@ -33,32 +33,32 @@ namespace orc { MAX }; - /* - A facility for dynamic dispatch according to available DispatchLevel. - - Typical use: - - static void my_function_default(...); - static void my_function_avx512(...); - - struct MyDynamicFunction { - using FunctionType = decltype(&my_function_default); - - static std::vector> implementations() { - return { - { DispatchLevel::NONE, my_function_default } - #if defined(ARROW_HAVE_RUNTIME_AVX512) - , { DispatchLevel::AVX512, my_function_avx512 } - #endif - }; - } - }; - - void my_function(...) 
{ - static DynamicDispatch dispatch; - return dispatch.func(...); - } - */ + /** + * A facility for dynamic dispatch according to available DispatchLevel. + * + * Typical use: + * + * static void my_function_default(...); + * static void my_function_avx512(...); + * + * struct MyDynamicFunction { + * using FunctionType = decltype(&my_function_default); + * + * static std::vector> implementations() { + * return { + * { DispatchLevel::NONE, my_function_default } + * #if defined(ORC_HAVE_RUNTIME_AVX512) + * , { DispatchLevel::AVX512, my_function_avx512 } + * #endif + * }; + * } + * }; + * + * void my_function(...) { + * static DynamicDispatch dispatch; + * return dispatch.func(...); + * } + */ template class DynamicDispatch { protected: @@ -92,13 +92,13 @@ namespace orc { private: bool IsSupported(DispatchLevel level) const { - static const auto cpu_info = orc::CpuInfo::GetInstance(); + static const auto cpu_info = CpuInfo::getInstance(); switch (level) { case DispatchLevel::NONE: return true; case DispatchLevel::AVX512: - return cpu_info->IsSupported(CpuInfo::AVX512); + return cpu_info->isSupported(CpuInfo::AVX512); default: return false; } diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake new file mode 100644 index 0000000000..6f3c6c3a18 --- /dev/null +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -0,0 +1,95 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +INCLUDE(CheckCXXCompilerFlag) +message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") + +if(NOT DEFINED ORC_SIMD_LEVEL) + set(ORC_SIMD_LEVEL + "DEFAULT" + CACHE STRING "Compile time SIMD optimization level") +endif() + +if(NOT DEFINED ORC_CPU_FLAG) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64") + set(ORC_CPU_FLAG "x86") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") + set(ORC_CPU_FLAG "aarch32") + else() + message(STATUS "Unknown system processor") + endif() +endif() + +# Check architecture specific compiler flags +if(ORC_CPU_FLAG STREQUAL "x86") + # x86/amd64 compiler flags, msvc/gcc/clang + if(MSVC) + set(ORC_AVX512_FLAG "/arch:AVX512") + else() + # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ + set(ORC_AVX512_FLAG "-march=native -mbmi2 -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") + endif() + check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) + if(MINGW) + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 + message(STATUS "Disable AVX512 support on MINGW for now") + else() + # Check for AVX512 support in the compiler. 
+ set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") + check_cxx_source_compiles(" + #ifdef _MSC_VER + #include + #else + #include + #endif + + int main() { + __m512i mask = _mm512_set1_epi32(0x1); + char out[32]; + _mm512_storeu_si512(out, mask); + return 0; + }" + CXX_SUPPORTS_AVX512) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + endif() + + message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") + # Runtime SIMD level it can get from compiler + if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512) + message(STATUS "Enable the AVX512 vector decode of bit-packing, compiler support AVX512") + set(ORC_HAVE_RUNTIME_AVX512 ON) + set(ORC_SIMD_LEVEL "AVX512") + add_definitions(-DORC_HAVE_RUNTIME_AVX512) + elseif(BUILD_ENABLE_AVX512 AND NOT CXX_SUPPORTS_AVX512) + message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") + elseif(NOT BUILD_ENABLE_AVX512) + set(ORC_HAVE_RUNTIME_AVX512 OFF) + message(STATUS "Disable the AVX512 vector decode of bit-packing") + endif() + if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") + set(ORC_SIMD_LEVEL "NONE") + endif() + + # Enable additional instruction sets if they are supported + if(MINGW) + # Enable _xgetbv() intrinsic to query OS support for ZMM register saves + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") + endif() + if(ORC_SIMD_LEVEL STREQUAL "AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") + elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") + message(WARNING "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") + endif() +endif() + +message(STATUS "ORC_HAVE_RUNTIME_AVX512: ${ORC_HAVE_RUNTIME_AVX512}, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}") From 0cf5620b79c088ed55bbb16eeda9f10cd214f31a Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 15 Feb 2023 22:33:46 -0500 Subject: [PATCH 26/80] Modified the macro name --- c++/src/BitUnpackerAvx512.hh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 55b662271c..dbc582cfae 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -16,8 +16,8 @@ * limitations under the License. */ -#ifndef BIT_UNPACKER_AVX512_HH -#define BIT_UNPACKER_AVX512_HH +#ifndef ORC_BIT_UNPACKER_AVX512_HH +#define ORC_BIT_UNPACKER_AVX512_HH #if defined(ORC_HAVE_RUNTIME_AVX512) From 1b8301fe009fd71408ecdaeb0bbd4ed95c2cbdd2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Feb 2023 21:12:24 -0500 Subject: [PATCH 27/80] 1.Fixed build error on macos 2.Modified code format in implementations() --- c++/src/Dispatch.hh | 5 +++-- c++/src/RleDecoderV2.cc | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/c++/src/Dispatch.hh b/c++/src/Dispatch.hh index 7f4382278f..489317b28a 100644 --- a/c++/src/Dispatch.hh +++ b/c++/src/Dispatch.hh @@ -78,7 +78,7 @@ namespace orc { Implementation cur{DispatchLevel::NONE, {}}; for (const auto& impl : implementations) { - if (impl.first >= cur.first && IsSupported(impl.first)) { + if (impl.first >= cur.first && levelSupported(impl.first)) { // Higher (or same) level than current cur = impl; } @@ -91,13 +91,14 @@ namespace orc { } private: - bool IsSupported(DispatchLevel level) const { + bool levelSupported(DispatchLevel level) const { static const auto cpu_info = CpuInfo::getInstance(); switch (level) { case DispatchLevel::NONE: return true; case DispatchLevel::AVX512: + case DispatchLevel::MAX: return cpu_info->isSupported(CpuInfo::AVX512); default: return false; diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 362ff599e2..94a70c2bce 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -48,7 +48,7 @@ namespace orc { if (bufferPointer == nullptr) { *bufStart += len; } else { - *bufStart = (char*)bufferPointer; + *bufStart = const_cast(static_cast(bufferPointer)); *bufEnd = *bufStart + bufferLength; } } @@ -61,7 +61,7 @@ namespace orc { if 
(!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::readByte"); } - *bufStart = (char*)bufferPointer; + *bufStart = const_cast(static_cast(bufferPointer)); *bufEnd = *bufStart + bufferLength; } @@ -99,12 +99,11 @@ namespace orc { using FunctionType = decltype(&readLongsDefault); static std::vector> implementations() { - return {{DispatchLevel::NONE, readLongsDefault} #if defined(ORC_HAVE_RUNTIME_AVX512) - , - {DispatchLevel::AVX512, readLongsAvx512} + return {{DispatchLevel::NONE, readLongsDefault}, {DispatchLevel::AVX512, readLongsAvx512}}; +#else + return {{DispatchLevel::NONE, readLongsDefault}}; #endif - }; } }; From b37c7dd41f8f3a3adb2555a707dfead9f6309838 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Feb 2023 04:54:04 -0500 Subject: [PATCH 28/80] Fixed the build error on macos. --- c++/src/CpuInfoUtil.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 1519f04bcc..0d91c2bea7 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -43,6 +43,7 @@ #include #include #include +#include #include "orc/Exceptions.hh" @@ -222,8 +223,7 @@ namespace orc { // ENOENT is the official errno value for non-existing sysctl's, // but EINVAL and ENOTSUP have been seen in the wild. 
if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) { - auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'"); - throw ParseError(st.ToString()); + throw ParseError("sysctlbyname failed for '" + name + "'"); } return std::nullopt; } @@ -277,6 +277,8 @@ namespace orc { } // TODO: vendor, model_name + vendor = Vendor::Unknown; + *model_name = "Unknown"; } #else @@ -505,11 +507,20 @@ namespace orc { CpuInfo::CpuInfo() : impl_(new Impl) {} +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif + const CpuInfo* CpuInfo::getInstance() { static CpuInfo cpu_info; return &cpu_info; } +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + int64_t CpuInfo::hardwareFlags() const { return impl_->hardware_flags; } From 23dd7ff9caa4b587264b9909da0cd18c945aa69e Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Feb 2023 17:57:16 -0500 Subject: [PATCH 29/80] Fix the build error on macos, and code format. --- c++/src/CpuInfoUtil.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 0d91c2bea7..dcec9198a6 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -223,7 +224,9 @@ namespace orc { // ENOENT is the official errno value for non-existing sysctl's, // but EINVAL and ENOTSUP have been seen in the wild. 
if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) { - throw ParseError("sysctlbyname failed for '" + name + "'"); + std::ostringstream ss; + ss << "sysctlbyname failed for '" << name << "'"; + throw ParseError(ss.str()); } return std::nullopt; } @@ -277,7 +280,7 @@ namespace orc { } // TODO: vendor, model_name - vendor = Vendor::Unknown; + *vendor = CpuInfo::Vendor::Unknown; *model_name = "Unknown"; } From 6a6f4911b9dce0a1e0ab3fe149f4241b90d7c7c5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Feb 2023 20:34:37 -0500 Subject: [PATCH 30/80] Fix build error on macos. --- c++/src/BpackingDefault.cc | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 50e3e2520f..19ec41fefb 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -44,7 +44,7 @@ namespace orc { numGroups = std::min(numGroups, static_cast(decoder->bufferEnd - decoder->bufferStart)); // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); uint32_t localByte; for (uint64_t i = 0; i < numGroups; ++i) { localByte = *buffer++; @@ -52,7 +52,7 @@ namespace orc { data[curIdx + 1] = localByte & 15; curIdx += 2; } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd' @@ -68,11 +68,11 @@ namespace orc { int64_t bufferNum = decoder->bufferEnd - decoder->bufferStart; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { data[curIdx++] = *buffer++; } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd'. @@ -88,14 +88,14 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint16_t b0, b1; // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); buffer += 2; data[curIdx++] = (b0 << 8) | b1; } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. @@ -113,7 +113,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2; // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -140,7 +140,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2, b3; // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -149,7 +149,7 @@ namespace orc { buffer += 4; data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. @@ -169,7 +169,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4; // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -180,7 +180,7 @@ namespace orc { data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. @@ -201,7 +201,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5; // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -213,7 +213,7 @@ namespace orc { data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. @@ -236,7 +236,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6; // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -249,7 +249,7 @@ namespace orc { data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. @@ -273,7 +273,7 @@ namespace orc { bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6, b7; // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->bufferStart); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -287,7 +287,7 @@ namespace orc { data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); } - decoder->bufferStart = (char*)buffer; + decoder->bufferStart = reinterpret_cast(buffer); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. From b2abf44be071fc913f7074c7b2ffa7f95ebb4b7f Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Feb 2023 21:56:35 -0500 Subject: [PATCH 31/80] Fix build error on macos --- c++/src/BpackingAvx512.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 9646e665ee..a41a5d802c 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -96,7 +96,7 @@ namespace orc { if (numElements >= 64) { __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); while (numElements >= 64) { - uint64_t src_64 = *(uint64_t*)srcPtr; + uint64_t src_64 = *reinterpret_cast(const_cast(srcPtr)); // convert mask to 512-bit register. 0 --> 0x00, 1 --> 0xFF __m512i srcmm = _mm512_movm_epi8(src_64); // make 0x00 --> 0x00, 0xFF --> 0x01 From 36f06aaf22792aae7fb729287c3f47bb5d192886 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Feb 2023 23:46:19 -0500 Subject: [PATCH 32/80] Fix a build error about "%ld" and "%lld" on macos. 
--- c++/test/TestRleVectorDecoder.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 3ee923fd74..777249b136 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -106,7 +106,12 @@ namespace orc { int32_t lpad = offset * BARWIDTH / total; int32_t rpad = BARWIDTH - lpad; - printf("\r%s:%3d%% [%.*s%*s] [%ld /%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); +#ifdef __APPLE__ + printf("\r%s:%3d%% [%.*s%*s] [%lld/%lld]", testName, val, lpad, BARSTR, rpad, "", offset, + total); +#else + printf("\r%s:%3d%% [%.*s%*s] [%ld/%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); +#endif fflush(stdout); } From 15db3d1a923d3e9fe80b2bf68c21181553ecb8fa Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 19 Feb 2023 04:59:59 -0500 Subject: [PATCH 33/80] Use std::cout instead of printf function --- c++/test/TestRleVectorDecoder.cc | 142 ++++++++++++++++--------------- 1 file changed, 75 insertions(+), 67 deletions(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 777249b136..d978836a00 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -33,6 +33,8 @@ namespace orc { using ::testing::Values; const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + const char finish = '#'; + std::string flags("-\\|/"); class RleV2BitUnpackAvx512Test : public TestWithParam { virtual void SetUp(); @@ -99,20 +101,26 @@ namespace orc { } } -#define BARSTR "##################################################" -#define BARWIDTH 50 - void testProgress(const char* testName, int64_t offset, int64_t total) { - int32_t val = offset * 100 / total; - int32_t lpad = offset * BARWIDTH / total; - int32_t rpad = BARWIDTH - lpad; + void printBar(const char* testName, int64_t offset, int64_t total) { + int64_t n = offset * 50 / total; + std::string progress(50, '.'); + for (int i = 0; i < n; 
i++) { + progress[i] = finish; + } -#ifdef __APPLE__ - printf("\r%s:%3d%% [%.*s%*s] [%lld/%lld]", testName, val, lpad, BARSTR, rpad, "", offset, - total); -#else - printf("\r%s:%3d%% [%.*s%*s] [%ld/%ld]", testName, val, lpad, BARSTR, rpad, "", offset, total); -#endif - fflush(stdout); + std::string f, p; + if (n == 50) { + f = "\e[1;32mOK\e[m"; + p = "\e[1;32m100%\e[m"; + } else { + f = flags[n % 4]; + p = std::to_string(n) + '%'; + } + std::cout << std::unitbuf << testName << ":" << '[' << f << ']' << '[' << progress << ']' << '[' + << p << "]" << '\r'; + if (n >= 50) { + std::cout << std::endl; + } } std::unique_ptr RleV2BitUnpackAvx512Test::getEncoder(RleVersion version, @@ -148,7 +156,7 @@ namespace orc { uint8_t bitWidth = 1; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("1bit Test 1st Part", blockSize, 10000); + printBar("1bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -156,7 +164,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("1bit Test 2nd Part", blockSize, 10000); + printBar("1bit Test 1st Part", blockSize, 10000); } printf("\n"); } @@ -165,7 +173,7 @@ namespace orc { uint8_t bitWidth = 2; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("2bit Test 1st Part", blockSize, 10000); + printBar("2bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -173,7 +181,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("2bit Test 2nd Part", blockSize, 10000); + printBar("2bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -182,7 +190,7 @@ namespace orc { uint8_t bitWidth = 3; for 
(uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("3bit Test 1st Part", blockSize, 10000); + printBar("3bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -190,7 +198,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("3bit Test 2nd Part", blockSize, 10000); + printBar("3bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -199,7 +207,7 @@ namespace orc { uint8_t bitWidth = 4; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("4bit Test 1st Part", blockSize, 10000); + printBar("4bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -207,7 +215,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("4bit Test 2nd Part", blockSize, 10000); + printBar("4bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -216,7 +224,7 @@ namespace orc { uint8_t bitWidth = 5; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("5bit Test 1st Part", blockSize, 10000); + printBar("5bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -224,7 +232,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("5bit Test 2nd Part", blockSize, 10000); + printBar("5bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -233,7 +241,7 @@ namespace orc { uint8_t bitWidth = 6; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); 
- testProgress("6bit Test 1st Part", blockSize, 10000); + printBar("6bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -241,7 +249,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("6bit Test 2nd Part", blockSize, 10000); + printBar("6bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -250,7 +258,7 @@ namespace orc { uint8_t bitWidth = 7; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("7bit Test 1st Part", blockSize, 10000); + printBar("7bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -258,7 +266,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("7bit Test 2nd Part", blockSize, 10000); + printBar("7bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -268,7 +276,7 @@ namespace orc { for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("9bit Test 1st Part", blockSize, 10000); + printBar("9bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -276,7 +284,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("9bit Test 2nd Part", blockSize, 10000); + printBar("9bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -285,7 +293,7 @@ namespace orc { uint8_t bitWidth = 10; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("10bit Test 1st Part", blockSize, 10000); + printBar("10bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -293,7 +301,7 @@ 
namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("10bit Test 2nd Part", blockSize, 10000); + printBar("10bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -302,7 +310,7 @@ namespace orc { uint8_t bitWidth = 11; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("11bit Test 1st Part", blockSize, 10000); + printBar("11bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -310,7 +318,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("11bit Test 2nd Part", blockSize, 10000); + printBar("11bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -319,7 +327,7 @@ namespace orc { uint8_t bitWidth = 12; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("12bit Test 1st Part", blockSize, 10000); + printBar("12bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -327,7 +335,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("12bit Test 2nd Part", blockSize, 10000); + printBar("12bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -336,7 +344,7 @@ namespace orc { uint8_t bitWidth = 13; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("13bit Test 1st Part", blockSize, 10000); + printBar("13bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -344,7 +352,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, 
dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("13bit Test 2nd Part", blockSize, 10000); + printBar("13bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -353,7 +361,7 @@ namespace orc { uint8_t bitWidth = 14; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("14bit Test 1st Part", blockSize, 10000); + printBar("14bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -361,7 +369,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("14bit Test 2nd Part", blockSize, 10000); + printBar("14bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -370,7 +378,7 @@ namespace orc { uint8_t bitWidth = 15; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("15bit Test 1st Part", blockSize, 10000); + printBar("15bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -378,7 +386,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("15bit Test 2nd Part", blockSize, 10000); + printBar("15bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -387,7 +395,7 @@ namespace orc { uint8_t bitWidth = 16; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("16bit Test 1st Part", blockSize, 10000); + printBar("16bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -395,7 +403,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("16bit Test 2nd Part", blockSize, 10000); + 
printBar("16bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -404,7 +412,7 @@ namespace orc { uint8_t bitWidth = 17; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("17bit Test 1st Part", blockSize, 10000); + printBar("17bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -412,7 +420,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("17bit Test 2nd Part", blockSize, 10000); + printBar("17bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -421,7 +429,7 @@ namespace orc { uint8_t bitWidth = 18; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("18bit Test 1st Part", blockSize, 10000); + printBar("18bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -429,7 +437,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("18bit Test 2nd Part", blockSize, 10000); + printBar("18bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -438,7 +446,7 @@ namespace orc { uint8_t bitWidth = 19; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("19bit Test 1st Part", blockSize, 10000); + printBar("19bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -446,7 +454,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("19bit Test 2nd Part", blockSize, 10000); + printBar("19bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -455,7 +463,7 @@ namespace orc { uint8_t 
bitWidth = 20; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("20bit Test 1st Part", blockSize, 10000); + printBar("20bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -463,7 +471,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("20bit Test 2nd Part", blockSize, 10000); + printBar("20bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -472,7 +480,7 @@ namespace orc { uint8_t bitWidth = 21; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("21bit Test 1st Part", blockSize, 10000); + printBar("21bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -480,7 +488,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("21bit Test 2nd Part", blockSize, 10000); + printBar("21bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -489,7 +497,7 @@ namespace orc { uint8_t bitWidth = 22; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("22bit Test 1st Part", blockSize, 10000); + printBar("22bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -497,7 +505,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("22bit Test 2nd Part", blockSize, 10000); + printBar("22bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -507,7 +515,7 @@ namespace orc { runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); for (uint64_t blockSize = 1; blockSize <= 10000; 
blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("23bit Test 1st Part", blockSize, 10000); + printBar("23bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -515,7 +523,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("23bit Test 2nd Part", blockSize, 10000); + printBar("23bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -524,7 +532,7 @@ namespace orc { uint8_t bitWidth = 24; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("24bit Test 1st Part", blockSize, 10000); + printBar("24bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -532,7 +540,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("24bit Test 2nd Part", blockSize, 10000); + printBar("24bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -541,7 +549,7 @@ namespace orc { uint8_t bitWidth = 26; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("26bit Test 1st Part", blockSize, 10000); + printBar("26bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -549,7 +557,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("26bit Test 2nd Part", blockSize, 10000); + printBar("26bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -558,7 +566,7 @@ namespace orc { uint8_t bitWidth = 28; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("28bit Test 1st 
Part", blockSize, 10000); + printBar("28bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -566,7 +574,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("28bit Test 2nd Part", blockSize, 10000); + printBar("28bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -575,7 +583,7 @@ namespace orc { uint8_t bitWidth = 30; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("30bit Test 1st Part", blockSize, 10000); + printBar("30bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -583,7 +591,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("30bit Test 2nd Part", blockSize, 10000); + printBar("30bit Test 2nd Part", blockSize, 10000); } printf("\n"); } @@ -592,7 +600,7 @@ namespace orc { uint8_t bitWidth = 32; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); - testProgress("32bit Test 1st Part", blockSize, 10000); + printBar("32bit Test 1st Part", blockSize, 10000); } printf("\n"); @@ -600,7 +608,7 @@ namespace orc { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - testProgress("32bit Test 2nd Part", blockSize, 10000); + printBar("32bit Test 2nd Part", blockSize, 10000); } printf("\n"); } From 42cc70319c38e20afdba53e40a0b871e0aa5d6bc Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 20 Feb 2023 17:11:17 -0500 Subject: [PATCH 34/80] Fix build error on macos. 
--- c++/test/TestRleVectorDecoder.cc | 70 ++++---------------------------- 1 file changed, 8 insertions(+), 62 deletions(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index d978836a00..3d67c321d2 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -34,7 +34,7 @@ namespace orc { const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M const char finish = '#'; - std::string flags("-\\|/"); + std::string flags = "-\\|/"; class RleV2BitUnpackAvx512Test : public TestWithParam { virtual void SetUp(); @@ -102,23 +102,23 @@ namespace orc { } void printBar(const char* testName, int64_t offset, int64_t total) { - int64_t n = offset * 50 / total; - std::string progress(50, '.'); + int64_t n = offset * 100 / total; + std::string progress(100, '.'); for (int i = 0; i < n; i++) { progress[i] = finish; } std::string f, p; - if (n == 50) { - f = "\e[1;32mOK\e[m"; - p = "\e[1;32m100%\e[m"; + if (n == 100) { + f = "OK"; + p = "100%"; } else { f = flags[n % 4]; p = std::to_string(n) + '%'; } std::cout << std::unitbuf << testName << ":" << '[' << f << ']' << '[' << progress << ']' << '[' << p << "]" << '\r'; - if (n >= 50) { + if (n >= 100) { std::cout << std::endl; } } @@ -158,15 +158,13 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("1bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); } - printBar("1bit Test 1st Part", blockSize, 10000); + printBar("1bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_2bit) { @@ -175,7 +173,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("2bit Test 1st Part", blockSize, 10000); } - printf("\n"); 
for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -183,7 +180,6 @@ namespace orc { } printBar("2bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_3bit) { @@ -192,7 +188,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("3bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -200,7 +195,6 @@ namespace orc { } printBar("3bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_4bit) { @@ -209,7 +203,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("4bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -217,7 +210,6 @@ namespace orc { } printBar("4bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_5bit) { @@ -226,7 +218,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("5bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -234,7 +225,6 @@ namespace orc { } printBar("5bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_6bit) { @@ -243,7 +233,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("6bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; 
blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -251,7 +240,6 @@ namespace orc { } printBar("6bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_7bit) { @@ -260,7 +248,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("7bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -268,7 +255,6 @@ namespace orc { } printBar("7bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_9bit) { @@ -278,7 +264,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("9bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -286,7 +271,6 @@ namespace orc { } printBar("9bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_10bit) { @@ -295,7 +279,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("10bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -303,7 +286,6 @@ namespace orc { } printBar("10bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_11bit) { @@ -312,7 +294,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("11bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; 
blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -320,7 +301,6 @@ namespace orc { } printBar("11bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_12bit) { @@ -329,7 +309,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("12bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -337,7 +316,6 @@ namespace orc { } printBar("12bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_13bit) { @@ -346,7 +324,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("13bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -354,7 +331,6 @@ namespace orc { } printBar("13bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_14bit) { @@ -363,7 +339,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("14bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -371,7 +346,6 @@ namespace orc { } printBar("14bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_15bit) { @@ -380,7 +354,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("15bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) 
{ for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -388,7 +361,6 @@ namespace orc { } printBar("15bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_16bit) { @@ -397,7 +369,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("16bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -405,7 +376,6 @@ namespace orc { } printBar("16bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_17bit) { @@ -414,7 +384,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("17bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -422,7 +391,6 @@ namespace orc { } printBar("17bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_18bit) { @@ -431,7 +399,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("18bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -439,7 +406,6 @@ namespace orc { } printBar("18bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_19bit) { @@ -448,7 +414,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("19bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t 
dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -456,7 +421,6 @@ namespace orc { } printBar("19bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_20bit) { @@ -465,7 +429,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("20bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -473,7 +436,6 @@ namespace orc { } printBar("20bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_21bit) { @@ -482,7 +444,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("21bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -490,7 +451,6 @@ namespace orc { } printBar("21bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_22bit) { @@ -499,7 +459,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("22bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -507,7 +466,6 @@ namespace orc { } printBar("22bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_23bit) { @@ -517,7 +475,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("23bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; 
dataSize <= 70000; dataSize += 1000) { @@ -525,7 +482,6 @@ namespace orc { } printBar("23bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_24bit) { @@ -534,7 +490,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("24bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -542,7 +497,6 @@ namespace orc { } printBar("24bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_26bit) { @@ -551,7 +505,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("26bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -559,7 +512,6 @@ namespace orc { } printBar("26bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_28bit) { @@ -568,7 +520,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("28bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -576,7 +527,6 @@ namespace orc { } printBar("28bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_30bit) { @@ -585,7 +535,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("30bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; 
dataSize += 1000) { @@ -593,7 +542,6 @@ namespace orc { } printBar("30bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_32bit) { @@ -602,7 +550,6 @@ namespace orc { runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); printBar("32bit Test 1st Part", blockSize, 10000); } - printf("\n"); for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { @@ -610,7 +557,6 @@ namespace orc { } printBar("32bit Test 2nd Part", blockSize, 10000); } - printf("\n"); } INSTANTIATE_TEST_SUITE_P(OrcTest, RleV2BitUnpackAvx512Test, Values(true, false)); From 9d86e3ddc52f037508fa61a62cf123937ff7224f Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Mar 2023 04:18:49 -0500 Subject: [PATCH 35/80] Macos doesn't support AVX512 fully. So skip Macos to support AVX512 decode. --- CMakeLists.txt | 4 +++- c++/src/CpuInfoUtil.cc | 6 ++++++ cmake_modules/ConfigSimdLevel.cmake | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb759d1a51..2b2d3e99eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,9 @@ enable_testing() INCLUDE(CheckSourceCompiles) INCLUDE(ThirdpartyToolchain) -INCLUDE(ConfigSimdLevel) +if (BUILD_ENABLE_AVX512 AND NOT APPLE) + INCLUDE(ConfigSimdLevel) +endif () set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples) diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index dcec9198a6..954675473c 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -60,6 +60,10 @@ #define CPUINFO_ARCH_PPC #endif +#ifndef ORC_HAVE_RUNTIME_AVX512 +#define UNUSED(x) (void)(x) +#endif + namespace orc { namespace { @@ -458,6 +462,8 @@ namespace orc { if (!ci->isDetected(CpuInfo::AVX512)) { throw ParseError("CPU does not support the Supplemental AVX512 instruction set"); } +#else + UNUSED(ci); #endif } diff --git 
a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 6f3c6c3a18..8704bfbeec 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -36,7 +36,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_AVX512_FLAG "/arch:AVX512") else() # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ - set(ORC_AVX512_FLAG "-march=native -mbmi2 -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi") + set(ORC_AVX512_FLAG "-march=native -mtune=native") endif() check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) if(MINGW) From 75e4cfad0e70f91ac1c3a71f864e7754e1b8b30e Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Mar 2023 04:41:07 -0500 Subject: [PATCH 36/80] Add the comments about arch=native compile option. --- cmake_modules/ConfigSimdLevel.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 8704bfbeec..30a2e26d02 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -35,7 +35,8 @@ if(ORC_CPU_FLAG STREQUAL "x86") if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") else() - # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ + # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. + # Using -march=native enables all instruction subsets supported by the local machine. set(ORC_AVX512_FLAG "-march=native -mtune=native") endif() check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) From 197f2e6c855fcd4a226f0a7707bf88724583ff33 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Mar 2023 04:20:48 -0500 Subject: [PATCH 37/80] Add the cpu flags information in the cmake process. 
--- c++/test/TestRleVectorDecoder.cc | 1 + cmake_modules/ConfigSimdLevel.cmake | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 3d67c321d2..9116b871a7 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -24,6 +24,7 @@ #include "wrap/orc-proto-wrapper.hh" #ifdef __clang__ +DIAGNOSTIC_IGNORE("-Winconsistent-missing-override") DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") #endif diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 30a2e26d02..0f082a4241 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -64,6 +64,13 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() + if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND NOT MSVC) + execute_process(COMMAND grep flags /proc/cpuinfo + COMMAND head -1 + OUTPUT_VARIABLE flags_ver) + message(STATUS "CPU ${flags_ver}") + endif() + message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") # Runtime SIMD level it can get from compiler if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512) From 1d050af3f7b3af7cea76e1d943f7e7085d7c1fa0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Mar 2023 18:37:31 -0500 Subject: [PATCH 38/80] Modified the cmake check of supoorting AVX512. 
--- c++/test/TestRleVectorDecoder.cc | 4 +--- cmake_modules/ConfigSimdLevel.cmake | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 9116b871a7..bcc05d9d7a 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -24,12 +24,11 @@ #include "wrap/orc-proto-wrapper.hh" #ifdef __clang__ -DIAGNOSTIC_IGNORE("-Winconsistent-missing-override") DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") #endif namespace orc { - +#if defined(ORC_HAVE_RUNTIME_AVX512) using ::testing::TestWithParam; using ::testing::Values; @@ -152,7 +151,6 @@ namespace orc { delete[] notNull; } -#if defined(ORC_HAVE_RUNTIME_AVX512) TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_1bit) { uint8_t bitWidth = 1; for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 0f082a4241..283e8d1ae6 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -39,7 +39,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") # Using -march=native enables all instruction subsets supported by the local machine. set(ORC_AVX512_FLAG "-march=native -mtune=native") endif() - check_cxx_compiler_flag(${ORC_AVX512_FLAG} CXX_SUPPORTS_AVX512) + if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 message(STATUS "Disable AVX512 support on MINGW for now") @@ -47,7 +47,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") # Check for AVX512 support in the compiler. 
set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") - check_cxx_source_compiles(" + CHECK_CXX_SOURCE_COMPILES(" #ifdef _MSC_VER #include #else From a239e47ba777a0c87c84af98a3073606f13c9b18 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Mar 2023 23:37:37 -0500 Subject: [PATCH 39/80] When user set BUILD_ENABLE_AVX512=on, but the compiler cannot support AVX512, it will print the warning instead of error in the cmake process. --- cmake_modules/ConfigSimdLevel.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 283e8d1ae6..f72bbcd375 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -79,7 +79,9 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) elseif(BUILD_ENABLE_AVX512 AND NOT CXX_SUPPORTS_AVX512) - message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") + message(WARNING "AVX512 required but compiler doesn't support it.") + message(WARNING "Failed to enable the AVX512 vector decode of bit-packing") + set(ORC_HAVE_RUNTIME_AVX512 OFF) elseif(NOT BUILD_ENABLE_AVX512) set(ORC_HAVE_RUNTIME_AVX512 OFF) message(STATUS "Disable the AVX512 vector decode of bit-packing") From b2b6aff7ef464f3d970cbf76b5eb32cda53165d1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 3 Mar 2023 15:54:27 -0500 Subject: [PATCH 40/80] Add the comment about -mtune=native in cmake process. 
--- cmake_modules/ConfigSimdLevel.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index f72bbcd375..b763cc0ae5 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -37,6 +37,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. + # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. set(ORC_AVX512_FLAG "-march=native -mtune=native") endif() From d7112e92906845cc4f52d25a720ba4c5303939d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 6 Mar 2023 16:26:16 -0500 Subject: [PATCH 41/80] 1.Add the new CI action to test AVX512 feature. 2.Change the default value of BUILD_ENABLE_AVX512 to OFF in make file 3.Change the warning message about AVX512 enabling to fatal error message --- .github/workflows/build_and_test.yml | 44 ++++++++++++++++++++++++++++ CMakeLists.txt | 2 +- cmake_modules/ConfigSimdLevel.cmake | 3 +- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8505235ef7..7d0d2bd8cc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -91,6 +91,50 @@ jobs: cmake --build . 
--config Debug ctest -C Debug --output-on-failure + simdUbuntu: + name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-20.04 + - ubuntu-22.04 + cxx: + - clang++ + env: + ORC_USER_SIMD_LEVEL: avx512 + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: "Test" + run: | + mkdir -p ~/.m2 + mkdir build + cd build + cmake -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON .. + make package test-out + + simdWindows: + name: "SIMD programming using C++ intrinsic functions on Windows" + runs-on: windows-2019 + env: + ORC_USER_SIMD_LEVEL: avx512 + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + with: + msbuild-architecture: x64 + - name: "Test" + run: | + mkdir build + cd build + cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Rlease -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON + cmake --build . 
--config Debug + ctest -C Debug --output-on-failure + doc: name: "Javadoc generation" runs-on: ubuntu-20.04 diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b2d3e99eb..66795e9d53 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,7 @@ option(BUILD_CPP_ENABLE_METRICS option(BUILD_ENABLE_AVX512 "Enable build with AVX512 at compile time" - ON) + OFF) # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index b763cc0ae5..8cfc3d817b 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -80,8 +80,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) elseif(BUILD_ENABLE_AVX512 AND NOT CXX_SUPPORTS_AVX512) - message(WARNING "AVX512 required but compiler doesn't support it.") - message(WARNING "Failed to enable the AVX512 vector decode of bit-packing") + message(FATAL_ERROR "AVX512 required but compiler doesn't support it, failed to enable AVX512.") set(ORC_HAVE_RUNTIME_AVX512 OFF) elseif(NOT BUILD_ENABLE_AVX512) set(ORC_HAVE_RUNTIME_AVX512 OFF) From 5b38980e88e436d8342b96bdd860cb9bf5e4a061 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 6 Mar 2023 17:44:05 -0500 Subject: [PATCH 42/80] Change the build_type back to Debug, keep consistent with the original. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7d0d2bd8cc..c83b7ef035 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -131,7 +131,7 @@ jobs: run: | mkdir build cd build - cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Rlease -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON + cmake .. 
-G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON cmake --build . --config Debug ctest -C Debug --output-on-failure From 6768165552ee7174f7738cb0d17a3159d18f12cc Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 8 Mar 2023 20:33:40 -0500 Subject: [PATCH 43/80] Fix an error about _mm512_load_si512 on some CPU core when running with debug binary. --- c++/src/BpackingAvx512.cc | 316 +++++++++++++++++++------------------- 1 file changed, 158 insertions(+), 158 deletions(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index a41a5d802c..e4c04701f4 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -94,7 +94,7 @@ namespace orc { } if (numElements >= 64) { - __m512i reverseMask1u = _mm512_load_si512(reverseMaskTable1u); + __m512i reverseMask1u = _mm512_loadu_si512(reverseMaskTable1u); while (numElements >= 64) { uint64_t src_64 = *reinterpret_cast(const_cast(srcPtr)); // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF @@ -362,15 +362,15 @@ namespace orc { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable3u); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable3u); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable3u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable3u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable3u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable3u_1); __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable3u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable3u_1); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable3u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable3u_1); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -638,15 +638,15 @@ namespace orc { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable5u); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable5u); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable5u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable5u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable5u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable5u_1); __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable5u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable5u_1); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable5u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable5u_1); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -782,15 +782,15 @@ namespace orc { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i 
permutexIdx = _mm512_load_si512(permutexIdxTable6u); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable6u); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable6u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable6u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable6u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable6u_1); __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable6u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable6u_1); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable6u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable6u_1); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -926,15 +926,15 @@ namespace orc { __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable7u); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable7u); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable7u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable7u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable7u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable7u_1); __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable7u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable7u_1); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable7u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable7u_1); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1069,22 +1069,22 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + 
__m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable9u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable9u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable9u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable9u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable9u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable9u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable9u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable9u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable9u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable9u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable9u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable9u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable9u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable9u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1264,9 +1264,9 @@ namespace orc { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable10u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable10u); - __m512i shiftMask = _mm512_load_si512(shiftTable10u); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable10u); while (numElements >= 32) { __m512i srcmm, zmm; @@ -1395,25 +1395,25 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = 
_mm512_load_si512(reverseMaskTable16u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); __m512i maskmm = _mm512_set1_epi8(0x0F); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable11u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable11u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable11u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable11u_1); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable11u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable11u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable11u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable11u_1); __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable11u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable11u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable11u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable11u_3); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable11u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable11u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable11u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable11u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable11u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable11u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1599,9 +1599,9 @@ namespace orc { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable12u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable12u); - __m512i shiftMask = _mm512_load_si512(shiftTable12u); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable12u); 
while (numElements >= 32) { __m512i srcmm, zmm; @@ -1730,25 +1730,25 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); __m512i maskmm = _mm512_set1_epi8(0x0F); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable13u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable13u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable13u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable13u_1); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable13u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable13u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable13u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable13u_1); __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable13u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable13u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable13u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable13u_3); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable13u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable13u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable13u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable13u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable13u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable13u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -1935,14 +1935,14 @@ namespace orc { __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable14u_0); - 
shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable14u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable14u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable14u_1); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable14u); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable14u); __m512i shiftMaskPtr[2]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable14u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable14u_1); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable14u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable14u_1); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2077,25 +2077,25 @@ namespace orc { if (numElements >= 32) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask16u = _mm512_load_si512(reverseMaskTable16u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u); __m512i maskmm = _mm512_set1_epi8(0x0F); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable15u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable15u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable15u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable15u_1); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable15u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable15u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable15u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable15u_1); __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable15u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable15u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable15u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable15u_3); + shiftMaskPtr[0] = 
_mm512_loadu_si512(shiftTable15u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable15u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable15u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable15u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable15u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable15u); while (numElements >= 64) { __m512i srcmm, zmm[2]; @@ -2245,7 +2245,7 @@ namespace orc { } if (numElements >= 32) { - __m512i reverse_mask_16u = _mm512_load_si512(reverseMaskTable16u); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); while (numElements >= 32) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); @@ -2360,22 +2360,22 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable17u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable17u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable17u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable17u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable17u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable17u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable17u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable17u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable17u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable17u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable17u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable17u_2); - 
__m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable17u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable17u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2554,22 +2554,22 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable18u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable18u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable18u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable18u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable18u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable18u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable18u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable18u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable18u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable18u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable18u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable18u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable18u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable18u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2748,22 +2748,22 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i 
nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable19u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable19u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable19u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable19u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable19u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable19u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable19u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable19u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable19u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable19u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable19u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable19u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable19u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable19u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2943,9 +2943,9 @@ namespace orc { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable20u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable20u); - __m512i shiftMask = _mm512_load_si512(shiftTable20u); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable20u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable20u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable20u); while (numElements >= 16u) { __m512i srcmm, zmm; @@ -3074,22 +3074,22 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); 
- __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable21u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable21u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable21u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable21u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable21u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable21u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable21u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable21u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable21u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable21u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable21u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable21u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable21u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable21u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3268,22 +3268,22 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable22u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable22u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable22u_0); - permutexIdxPtr[1] = 
_mm512_load_si512(permutexIdxTable22u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable22u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable22u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable22u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable22u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable22u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable22u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable22u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable22u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable22u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable22u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3463,22 +3463,22 @@ namespace orc { if (numElements >= 16) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable23u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable23u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable23u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable23u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable23u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable23u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable23u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable23u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable23u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable23u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable23u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable23u_2); - 
__m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable23u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable23u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3624,8 +3624,8 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); - __m512i shuffleIdx = _mm512_load_si512(shuffleIdxTable24u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable24u); + __m512i shuffleIdx = _mm512_loadu_si512(shuffleIdxTable24u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable24u); while (numElements >= 16) { __m512i srcmm, zmm; @@ -3746,22 +3746,22 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable26u_0); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable26u_0); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable26u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable26u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable26u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable26u_1); __m512i shiftMaskPtr[3]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable26u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable26u_1); - shiftMaskPtr[2] = _mm512_load_si512(shiftTable26u_2); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable26u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable26u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable26u_2); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable26u); 
+ __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable26u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -3941,9 +3941,9 @@ namespace orc { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i shuffleIdxPtr = _mm512_load_si512(shuffleIdxTable28u_0); - __m512i permutexIdx = _mm512_load_si512(permutexIdxTable28u); - __m512i shiftMask = _mm512_load_si512(shiftTable28u); + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable28u); while (numElements >= 16) { __m512i srcmm, zmm; @@ -4072,25 +4072,25 @@ namespace orc { if (numElements >= 16) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); - __m512i nibbleReversemm = _mm512_load_si512(nibbleReverseTable); - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); __m512i maskmm = _mm512_set1_epi8(0x0F); __m512i shuffleIdxPtr[2]; - shuffleIdxPtr[0] = _mm512_load_si512(shuffleIdxTable30u_0); - shuffleIdxPtr[1] = _mm512_load_si512(shuffleIdxTable30u_1); + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable30u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable30u_1); __m512i permutexIdxPtr[2]; - permutexIdxPtr[0] = _mm512_load_si512(permutexIdxTable30u_0); - permutexIdxPtr[1] = _mm512_load_si512(permutexIdxTable30u_1); + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable30u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable30u_1); __m512i shiftMaskPtr[4]; - shiftMaskPtr[0] = _mm512_load_si512(shiftTable30u_0); - shiftMaskPtr[1] = _mm512_load_si512(shiftTable30u_1); - shiftMaskPtr[2] = 
_mm512_load_si512(shiftTable30u_2); - shiftMaskPtr[3] = _mm512_load_si512(shiftTable30u_3); + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable30u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable30u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable30u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable30u_3); - __m512i gatherIdxmm = _mm512_load_si512(gatherIdxTable30u); + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable30u); while (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -4239,7 +4239,7 @@ namespace orc { } if (numElements >= 16) { - __m512i reverseMask32u = _mm512_load_si512(reverseMaskTable32u); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); while (numElements >= 16) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); From ce7f6deba95afd3307a6783fc609b564e08728ba Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 10 Mar 2023 23:11:33 -0500 Subject: [PATCH 44/80] Most hotspot of function RleDecoderV2::resetBufferStart locates in saving stack, so inline this function to have performance gain. --- c++/src/RLEv2.hh | 33 +++++++++++++++++++++++++++++++-- c++/src/RleDecoderV2.cc | 24 ------------------------ 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index c56ef7cfa3..de7fb30a02 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -167,8 +167,13 @@ namespace orc { void next(int16_t* data, uint64_t numValues, const char* notNull) override; unsigned char readByte(char** bufStart, char** bufEnd); - void resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, - uint32_t backupLen); + + /** + * Most hotspot of this function locates in saving stack, so inline this function to have + * performance gain. 
+ */ + inline void resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, + uint32_t backupLen); char* bufferStart; char* bufferEnd; @@ -222,6 +227,30 @@ namespace orc { DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run }; + + void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, + uint32_t backupByteLen) { + uint64_t remainingLen = *bufEnd - *bufStart; + int bufferLength = 0; + const void* bufferPointer = nullptr; + + if (backupByteLen != 0) { + inputStream->BackUp(backupByteLen); + } + + if (len >= remainingLen && resetBuf == true) { + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::resetBufferStart"); + } + } + + if (bufferPointer == nullptr) { + *bufStart += len; + } else { + *bufStart = const_cast(static_cast(bufferPointer)); + *bufEnd = *bufStart + bufferLength; + } + } } // namespace orc #endif // ORC_RLEV2_HH diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 94a70c2bce..516f18d6c4 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -29,30 +29,6 @@ namespace orc { - void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, - uint32_t backupByteLen) { - uint64_t remainingLen = *bufEnd - *bufStart; - int bufferLength = 0; - const void* bufferPointer = nullptr; - - if (backupByteLen != 0) { - inputStream->BackUp(backupByteLen); - } - - if (len >= remainingLen && resetBuf == true) { - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::resetBufferStart"); - } - } - - if (bufferPointer == nullptr) { - *bufStart += len; - } else { - *bufStart = const_cast(static_cast(bufferPointer)); - *bufEnd = *bufStart + bufferLength; - } - } - unsigned char RleDecoderV2::readByte(char** bufStart, char** bufEnd) { SCOPED_MINUS_STOPWATCH(metrics, DecodingLatencyUs); if (*bufStart == 
*bufEnd) { From 8f6806b0afa1b7542bef15cdbdb16c9d21032151 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 14 Mar 2023 14:38:53 -0400 Subject: [PATCH 45/80] Modified some cmake options and status message --- .github/workflows/build_and_test.yml | 1 - CMakeLists.txt | 6 ++++++ c++/src/Bpacking.hh | 2 +- cmake_modules/ConfigSimdLevel.cmake | 23 +++++------------------ 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c83b7ef035..019351fb72 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -98,7 +98,6 @@ jobs: fail-fast: false matrix: os: - - ubuntu-20.04 - ubuntu-22.04 cxx: - clang++ diff --git a/CMakeLists.txt b/CMakeLists.txt index 66795e9d53..69a6af4ab6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,12 @@ enable_testing() INCLUDE(CheckSourceCompiles) INCLUDE(ThirdpartyToolchain) +message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") +# +# macOS doesn't fully support AVX512, it has a different way dealing with AVX512 than Windows and Linux. 
+# +# Here can find the description: +# https://github.com/apple/darwin-xnu/blob/0a798f6738bc1db01281fc08ae024145e84df927/osfmk/i386/fpu.c#L176 if (BUILD_ENABLE_AVX512 AND NOT APPLE) INCLUDE(ConfigSimdLevel) endif () diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 4aa1e1fc88..4a857be8b5 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -19,7 +19,7 @@ #ifndef ORC_BPACKING_HH #define ORC_BPACKING_HH -#include +#include #include "RLEv2.hh" diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 8cfc3d817b..0d6ad35eb4 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -22,10 +22,8 @@ endif() if(NOT DEFINED ORC_CPU_FLAG) if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64") set(ORC_CPU_FLAG "x86") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|armv[4-7]") - set(ORC_CPU_FLAG "aarch32") else() - message(STATUS "Unknown system processor") + message(STATUS "Unsupported system processor for SIMD optimization") endif() endif() @@ -65,40 +63,29 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() - if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512 AND NOT MSVC) + if(CXX_SUPPORTS_AVX512 AND NOT MSVC) execute_process(COMMAND grep flags /proc/cpuinfo COMMAND head -1 OUTPUT_VARIABLE flags_ver) message(STATUS "CPU ${flags_ver}") endif() - message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") # Runtime SIMD level it can get from compiler - if(BUILD_ENABLE_AVX512 AND CXX_SUPPORTS_AVX512) - message(STATUS "Enable the AVX512 vector decode of bit-packing, compiler support AVX512") + if(CXX_SUPPORTS_AVX512) + message(STATUS "Enabled the AVX512 for RLE bit-unpacking") set(ORC_HAVE_RUNTIME_AVX512 ON) set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) - elseif(BUILD_ENABLE_AVX512 AND NOT CXX_SUPPORTS_AVX512) + else() message(FATAL_ERROR "AVX512 required but compiler doesn't support it, failed to enable 
AVX512.") set(ORC_HAVE_RUNTIME_AVX512 OFF) - elseif(NOT BUILD_ENABLE_AVX512) - set(ORC_HAVE_RUNTIME_AVX512 OFF) - message(STATUS "Disable the AVX512 vector decode of bit-packing") endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") set(ORC_SIMD_LEVEL "NONE") endif() - # Enable additional instruction sets if they are supported - if(MINGW) - # Enable _xgetbv() intrinsic to query OS support for ZMM register saves - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mxsave") - endif() if(ORC_SIMD_LEVEL STREQUAL "AVX512") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") - elseif(NOT ORC_SIMD_LEVEL STREQUAL "NONE") - message(WARNING "ORC_SIMD_LEVEL=${ORC_SIMD_LEVEL} not supported by x86.") endif() endif() From e27be9ec4cfef8e5482708124ba93138efd7e7ad Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Mar 2023 08:53:56 -0400 Subject: [PATCH 46/80] Delete macro ORC_HAVE_RUNTIME_AVX512. Modified CMakeLists.txt to choose if build AVX512 file. delete file Bpacking.cc. --- c++/src/BitUnpackerAvx512.hh | 3 - c++/src/Bpacking.cc | 216 ------------------------------- c++/src/Bpacking.hh | 12 +- c++/src/BpackingAvx512.cc | 151 ++++++++++++++++++++- c++/src/BpackingAvx512.hh | 8 +- c++/src/BpackingDefault.cc | 39 ++++++ c++/src/BpackingDefault.hh | 6 + c++/src/CMakeLists.txt | 10 +- c++/src/CpuInfoUtil.cc | 1 - c++/src/RleDecoderV2.cc | 15 ++- c++/test/CMakeLists.txt | 6 +- c++/test/TestRleVectorDecoder.cc | 2 - 12 files changed, 221 insertions(+), 248 deletions(-) delete mode 100644 c++/src/Bpacking.cc diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index dbc582cfae..1e524e4e98 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -19,8 +19,6 @@ #ifndef ORC_BIT_UNPACKER_AVX512_HH #define ORC_BIT_UNPACKER_AVX512_HH -#if defined(ORC_HAVE_RUNTIME_AVX512) - // Mingw-w64 defines strcasecmp in string.h #if defined(_WIN32) && !defined(strcasecmp) #include @@ -486,5 +484,4 @@ namespace orc { } } // namespace orc -#endif // #if 
defined(ORC_HAVE_RUNTIME_AVX512) #endif diff --git a/c++/src/Bpacking.cc b/c++/src/Bpacking.cc deleted file mode 100644 index 64f11f013f..0000000000 --- a/c++/src/Bpacking.cc +++ /dev/null @@ -1,216 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Bpacking.hh" -#include "BpackingDefault.hh" -#include "CpuInfoUtil.hh" -#if defined(ORC_HAVE_RUNTIME_AVX512) -#include "BpackingAvx512.hh" -#endif - -namespace orc { - int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs) { - UnpackDefault unpackDefault(decoder); - switch (fbs) { - case 4: - unpackDefault.unrolledUnpack4(data, offset, len); - break; - case 8: - unpackDefault.unrolledUnpack8(data, offset, len); - break; - case 16: - unpackDefault.unrolledUnpack16(data, offset, len); - break; - case 24: - unpackDefault.unrolledUnpack24(data, offset, len); - break; - case 32: - unpackDefault.unrolledUnpack32(data, offset, len); - break; - case 40: - unpackDefault.unrolledUnpack40(data, offset, len); - break; - case 48: - unpackDefault.unrolledUnpack48(data, offset, len); - break; - case 56: - unpackDefault.unrolledUnpack56(data, offset, len); - break; - case 64: - unpackDefault.unrolledUnpack64(data, offset, len); - break; - default: - // Fallback to the default implementation for deprecated bit size. 
- unpackDefault.plainUnpackLongs(data, offset, len, fbs); - break; - } - return 0; - } - -#if defined(ORC_HAVE_RUNTIME_AVX512) - int readLongsAvx512(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs) { - UnpackAvx512 unpackAvx512(decoder); - UnpackDefault unpackDefault(decoder); - uint64_t startBit = 0; - static const auto cpu_info = CpuInfo::getInstance(); - if (cpu_info->isSupported(CpuInfo::AVX512)) { - switch (fbs) { - case 1: - unpackAvx512.vectorUnpack1(data, offset, len); - break; - case 2: - unpackAvx512.vectorUnpack2(data, offset, len); - break; - case 3: - unpackAvx512.vectorUnpack3(data, offset, len); - break; - case 4: - unpackAvx512.vectorUnpack4(data, offset, len); - break; - case 5: - unpackAvx512.vectorUnpack5(data, offset, len); - break; - case 6: - unpackAvx512.vectorUnpack6(data, offset, len); - break; - case 7: - unpackAvx512.vectorUnpack7(data, offset, len); - break; - case 8: - unpackDefault.unrolledUnpack8(data, offset, len); - break; - case 9: - unpackAvx512.vectorUnpack9(data, offset, len); - break; - case 10: - unpackAvx512.vectorUnpack10(data, offset, len); - break; - case 11: - unpackAvx512.vectorUnpack11(data, offset, len); - break; - case 12: - unpackAvx512.vectorUnpack12(data, offset, len); - break; - case 13: - unpackAvx512.vectorUnpack13(data, offset, len); - break; - case 14: - unpackAvx512.vectorUnpack14(data, offset, len); - break; - case 15: - unpackAvx512.vectorUnpack15(data, offset, len); - break; - case 16: - unpackAvx512.vectorUnpack16(data, offset, len); - break; - case 17: - unpackAvx512.vectorUnpack17(data, offset, len); - break; - case 18: - unpackAvx512.vectorUnpack18(data, offset, len); - break; - case 19: - unpackAvx512.vectorUnpack19(data, offset, len); - break; - case 20: - unpackAvx512.vectorUnpack20(data, offset, len); - break; - case 21: - unpackAvx512.vectorUnpack21(data, offset, len); - break; - case 22: - unpackAvx512.vectorUnpack22(data, offset, len); - break; - case 23: - 
unpackAvx512.vectorUnpack23(data, offset, len); - break; - case 24: - unpackAvx512.vectorUnpack24(data, offset, len); - break; - case 26: - unpackAvx512.vectorUnpack26(data, offset, len); - break; - case 28: - unpackAvx512.vectorUnpack28(data, offset, len); - break; - case 30: - unpackAvx512.vectorUnpack30(data, offset, len); - break; - case 32: - unpackAvx512.vectorUnpack32(data, offset, len); - break; - case 40: - unpackDefault.unrolledUnpack40(data, offset, len); - break; - case 48: - unpackDefault.unrolledUnpack48(data, offset, len); - break; - case 56: - unpackDefault.unrolledUnpack56(data, offset, len); - break; - case 64: - unpackDefault.unrolledUnpack64(data, offset, len); - break; - default: - // Fallback to the default implementation for deprecated bit size. - unpackAvx512.plainUnpackLongs(data, offset, len, fbs, startBit); - break; - } - } else { - switch (fbs) { - case 4: - unpackDefault.unrolledUnpack4(data, offset, len); - break; - case 8: - unpackDefault.unrolledUnpack8(data, offset, len); - break; - case 16: - unpackDefault.unrolledUnpack16(data, offset, len); - break; - case 24: - unpackDefault.unrolledUnpack24(data, offset, len); - break; - case 32: - unpackDefault.unrolledUnpack32(data, offset, len); - break; - case 40: - unpackDefault.unrolledUnpack40(data, offset, len); - break; - case 48: - unpackDefault.unrolledUnpack48(data, offset, len); - break; - case 56: - unpackDefault.unrolledUnpack56(data, offset, len); - break; - case 64: - unpackDefault.unrolledUnpack64(data, offset, len); - break; - default: - // Fallback to the default implementation for deprecated bit size. 
- unpackDefault.plainUnpackLongs(data, offset, len, fbs); - break; - } - } - - return 0; - } -#endif - -} // namespace orc diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 4a857be8b5..cb95ae2841 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -24,14 +24,10 @@ #include "RLEv2.hh" namespace orc { - int readLongsDefault(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); - -#if defined(ORC_HAVE_RUNTIME_AVX512) - int readLongsAvx512(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); -#endif - + class BitUnpack { + public: + int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + }; } // namespace orc #endif diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index e4c04701f4..85fcf8592a 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -16,8 +16,6 @@ * limitations under the License. */ -#if defined(ORC_HAVE_RUNTIME_AVX512) - #include "BpackingAvx512.hh" #include "BitUnpackerAvx512.hh" #include "Utils.hh" @@ -4313,6 +4311,151 @@ namespace orc { } } -} // namespace orc + int BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs) { + UnpackAvx512 unpackAvx512(decoder); + UnpackDefault unpackDefault(decoder); + uint64_t startBit = 0; + static const auto cpu_info = CpuInfo::getInstance(); + if (cpu_info->isSupported(CpuInfo::AVX512)) { + switch (fbs) { + case 1: + unpackAvx512.vectorUnpack1(data, offset, len); + break; + case 2: + unpackAvx512.vectorUnpack2(data, offset, len); + break; + case 3: + unpackAvx512.vectorUnpack3(data, offset, len); + break; + case 4: + unpackAvx512.vectorUnpack4(data, offset, len); + break; + case 5: + unpackAvx512.vectorUnpack5(data, offset, len); + break; + case 6: + unpackAvx512.vectorUnpack6(data, offset, len); + break; + case 7: + unpackAvx512.vectorUnpack7(data, offset, len); + break; + case 8: + 
unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 9: + unpackAvx512.vectorUnpack9(data, offset, len); + break; + case 10: + unpackAvx512.vectorUnpack10(data, offset, len); + break; + case 11: + unpackAvx512.vectorUnpack11(data, offset, len); + break; + case 12: + unpackAvx512.vectorUnpack12(data, offset, len); + break; + case 13: + unpackAvx512.vectorUnpack13(data, offset, len); + break; + case 14: + unpackAvx512.vectorUnpack14(data, offset, len); + break; + case 15: + unpackAvx512.vectorUnpack15(data, offset, len); + break; + case 16: + unpackAvx512.vectorUnpack16(data, offset, len); + break; + case 17: + unpackAvx512.vectorUnpack17(data, offset, len); + break; + case 18: + unpackAvx512.vectorUnpack18(data, offset, len); + break; + case 19: + unpackAvx512.vectorUnpack19(data, offset, len); + break; + case 20: + unpackAvx512.vectorUnpack20(data, offset, len); + break; + case 21: + unpackAvx512.vectorUnpack21(data, offset, len); + break; + case 22: + unpackAvx512.vectorUnpack22(data, offset, len); + break; + case 23: + unpackAvx512.vectorUnpack23(data, offset, len); + break; + case 24: + unpackAvx512.vectorUnpack24(data, offset, len); + break; + case 26: + unpackAvx512.vectorUnpack26(data, offset, len); + break; + case 28: + unpackAvx512.vectorUnpack28(data, offset, len); + break; + case 30: + unpackAvx512.vectorUnpack30(data, offset, len); + break; + case 32: + unpackAvx512.vectorUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. 
+ unpackAvx512.plainUnpackLongs(data, offset, len, fbs, startBit); + break; + } + } else { + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + } -#endif // #if defined(ORC_HAVE_RUNTIME_AVX512) + return 0; + } +} // namespace orc diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 1ed3d5eca4..db974e59b5 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -19,8 +19,6 @@ #ifndef ORC_BPACKINGAVX512_HH #define ORC_BPACKINGAVX512_HH -#if defined(ORC_HAVE_RUNTIME_AVX512) - #include #include @@ -84,7 +82,11 @@ namespace orc { uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; }; + class BitUnpackAVX512 : public BitUnpack { + public: + static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + }; + } // namespace orc -#endif // #if defined(ORC_HAVE_RUNTIME_AVX512) #endif diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 19ec41fefb..4afce3467f 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -326,4 +326,43 @@ namespace orc { } } + int BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs) { + UnpackDefault 
unpackDefault(decoder); + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + return 0; + } + } // namespace orc diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index 89dcc24b1b..ab792f6602 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -22,6 +22,7 @@ #include #include +#include "Bpacking.hh" #include "RLEv2.hh" #include "io/InputStream.hh" #include "io/OutputStream.hh" @@ -49,6 +50,11 @@ namespace orc { RleDecoderV2* decoder; }; + class BitUnpackDefault : public BitUnpack { + public: + static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + }; + } // namespace orc #endif diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 5dc217c384..7ee8751d8f 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -186,15 +186,19 @@ set(SOURCE_FILES Vector.cc Writer.cc CpuInfoUtil.cc - BpackingDefault.cc - BpackingAvx512.cc - Bpacking.cc) + BpackingDefault.cc) if(BUILD_LIBHDFSPP) set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc) add_definitions(-DBUILD_LIBHDFSPP) endif(BUILD_LIBHDFSPP) +if(BUILD_ENABLE_AVX512) + set(SOURCE_FILES + ${SOURCE_FILES} + BpackingAvx512.cc) +endif(BUILD_ENABLE_AVX512) + 
add_library (orc STATIC ${SOURCE_FILES}) target_link_libraries (orc diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 954675473c..8812ca2317 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -283,7 +283,6 @@ namespace orc { } } - // TODO: vendor, model_name *vendor = CpuInfo::Vendor::Unknown; *model_name = "Unknown"; } diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 516f18d6c4..779cfa00b6 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -17,15 +17,16 @@ */ #include "Adaptor.hh" -#include "Bpacking.hh" +//#include "Bpacking.hh" +#include "BpackingDefault.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#include "BpackingAvx512.hh" +#endif #include "Compression.hh" #include "Dispatch.hh" #include "RLEV2Util.hh" #include "RLEv2.hh" #include "Utils.hh" -#if defined(ORC_HAVE_RUNTIME_AVX512) -#include "BpackingAvx512.hh" -#endif namespace orc { @@ -72,13 +73,13 @@ namespace orc { } struct UnpackDynamicFunction { - using FunctionType = decltype(&readLongsDefault); + using FunctionType = decltype(&BitUnpackDefault::readLongs); static std::vector> implementations() { #if defined(ORC_HAVE_RUNTIME_AVX512) - return {{DispatchLevel::NONE, readLongsDefault}, {DispatchLevel::AVX512, readLongsAvx512}}; + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}, {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}}; #else - return {{DispatchLevel::NONE, readLongsDefault}}; + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}}; #endif } }; diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index 4fd0f70fdc..6698e5d49a 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -18,6 +18,10 @@ include_directories( set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") +if(BUILD_ENABLE_AVX512) + set(SIMD_TEST TestRleVectorDecoder.cc) +endif(BUILD_ENABLE_AVX512) + add_executable (orc-test MemoryInputStream.cc MemoryOutputStream.cc @@ -42,7 +46,6 @@ add_executable 
(orc-test TestReader.cc TestRleDecoder.cc TestRleEncoder.cc - TestRleVectorDecoder.cc TestRLEV2Util.cc TestSargsApplier.cc TestSearchArgument.cc @@ -51,6 +54,7 @@ add_executable (orc-test TestTimezone.cc TestType.cc TestWriter.cc + ${SIMD_TEST} ) target_link_libraries (orc-test diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index bcc05d9d7a..7b99c09548 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -28,7 +28,6 @@ DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") #endif namespace orc { -#if defined(ORC_HAVE_RUNTIME_AVX512) using ::testing::TestWithParam; using ::testing::Values; @@ -559,5 +558,4 @@ namespace orc { } INSTANTIATE_TEST_SUITE_P(OrcTest, RleV2BitUnpackAvx512Test, Values(true, false)); -#endif } // namespace orc From 5b0e66d158d46bfb6ed00b9a355c302b37cc9dc2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Mar 2023 10:30:46 -0400 Subject: [PATCH 47/80] Modified the code format. --- c++/src/Bpacking.hh | 3 ++- c++/src/BpackingAvx512.cc | 4 ++-- c++/src/BpackingAvx512.hh | 3 ++- c++/src/BpackingDefault.cc | 4 ++-- c++/src/BpackingDefault.hh | 3 ++- c++/src/RleDecoderV2.cc | 7 ++++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index cb95ae2841..6bcd2c9565 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -26,7 +26,8 @@ namespace orc { class BitUnpack { public: - int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 85fcf8592a..fef458b396 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -4311,8 +4311,8 @@ namespace orc { } } - int BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t 
fbs) { + int BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { UnpackAvx512 unpackAvx512(decoder); UnpackDefault unpackDefault(decoder); uint64_t startBit = 0; diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index db974e59b5..affe62723a 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -84,7 +84,8 @@ namespace orc { class BitUnpackAVX512 : public BitUnpack { public: - static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 4afce3467f..8a587a52ff 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -326,8 +326,8 @@ namespace orc { } } - int BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs) { + int BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { UnpackDefault unpackDefault(decoder); switch (fbs) { case 4: diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index ab792f6602..46663c135f 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -52,7 +52,8 @@ namespace orc { class BitUnpackDefault : public BitUnpack { public: - static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 779cfa00b6..f05ff5656a 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -17,7 +17,7 @@ */ #include "Adaptor.hh" -//#include "Bpacking.hh" +// #include "Bpacking.hh" #include 
"BpackingDefault.hh" #if defined(ORC_HAVE_RUNTIME_AVX512) #include "BpackingAvx512.hh" @@ -73,11 +73,12 @@ namespace orc { } struct UnpackDynamicFunction { - using FunctionType = decltype(&BitUnpackDefault::readLongs); + using FunctionType = decltype(&BitUnpack::readLongs); static std::vector> implementations() { #if defined(ORC_HAVE_RUNTIME_AVX512) - return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}, {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}}; + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}, + {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}}; #else return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}}; #endif From 8c99fcd68df8d3ee076cc70718903d6d647856be Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Mar 2023 16:05:29 -0400 Subject: [PATCH 48/80] 1.Delete the redundancy code in CpuInfo file 2.change stdint.h to cstdint --- c++/src/BitUnpackerAvx512.hh | 1 + c++/src/BpackingAvx512.hh | 2 +- c++/src/BpackingDefault.hh | 2 +- c++/src/CpuInfoUtil.cc | 44 ++---------------------------------- c++/src/CpuInfoUtil.hh | 10 +++++--- 5 files changed, 12 insertions(+), 47 deletions(-) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 1e524e4e98..67f362a2f7 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -28,6 +28,7 @@ #endif #include +#include #include namespace orc { diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index affe62723a..9f45167f91 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -19,8 +19,8 @@ #ifndef ORC_BPACKINGAVX512_HH #define ORC_BPACKINGAVX512_HH -#include #include +#include #include "BpackingDefault.hh" #include "Dispatch.hh" diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index 46663c135f..dc7fde264e 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -19,8 +19,8 @@ #ifndef ORC_BPACKINGDEFAULT_HH #define ORC_BPACKINGDEFAULT_HH -#include #include +#include 
#include "Bpacking.hh" #include "RLEv2.hh" diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 8812ca2317..c7daa57c62 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -35,13 +35,9 @@ #include #include #include -#include -#include #include #include -#include #include -#include #include #include #include @@ -49,15 +45,9 @@ #include "orc/Exceptions.hh" #undef CPUINFO_ARCH_X86 -#undef CPUINFO_ARCH_ARM -#undef CPUINFO_ARCH_PPC #if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) #define CPUINFO_ARCH_X86 -#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) -#define CPUINFO_ARCH_ARM -#elif defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) -#define CPUINFO_ARCH_PPC #endif #ifndef ORC_HAVE_RUNTIME_AVX512 @@ -208,13 +198,6 @@ namespace orc { } } } -#elif defined(CPUINFO_ARCH_ARM) - // Windows on Arm - void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, - std::string* model_name) { - *hardware_flags |= CpuInfo::ASIMD; - // TODO: vendor, model_name - } #endif #elif defined(__APPLE__) @@ -271,9 +254,6 @@ namespace orc { {"hw.optional.avx512dq", CpuInfo::AVX512DQ}, {"hw.optional.avx512bw", CpuInfo::AVX512BW}, {"hw.optional.avx512vl", CpuInfo::AVX512VL}, -#elif defined(CPUINFO_ARCH_ARM) - // ARM64 (note that this is exposed under Rosetta as well) - {"hw.optional.neon", CpuInfo::ASIMD}, #endif }; for (const auto& feature : features) { @@ -283,6 +263,7 @@ namespace orc { } } + // TODO: vendor, model_name *vendor = CpuInfo::Vendor::Unknown; *model_name = "Unknown"; } @@ -358,8 +339,6 @@ namespace orc { {"avx512bw", CpuInfo::AVX512BW}, {"bmi1", CpuInfo::BMI1}, {"bmi2", CpuInfo::BMI2}, -#elif defined(CPUINFO_ARCH_ARM) - {"asimd", CpuInfo::ASIMD}, #endif }; const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); @@ -401,7 +380,6 @@ namespace orc { } // Read from /proc/cpuinfo - // TODO: vendor, model_name for Arm void 
OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, std::string* model_name) { std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); @@ -466,23 +444,7 @@ namespace orc { #endif } -#elif defined(CPUINFO_ARCH_ARM) - //------------------------------ AARCH64 ------------------------------// - bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { - if (simd_level == "NONE") { - *hardware_flags &= ~CpuInfo::ASIMD; - return true; - } - return false; - } - - void ArchVerifyCpuRequirements(const CpuInfo* ci) { - if (!ci->isDetected(CpuInfo::ASIMD)) { - throw ParseError("CPU does not support the Armv8 Neon instruction set"); - } - } - -#endif // X86, ARM +#endif // X86 } // namespace @@ -576,5 +538,3 @@ namespace orc { } // namespace orc #undef CPUINFO_ARCH_X86 -#undef CPUINFO_ARCH_ARM -#undef CPUINFO_ARCH_PPC diff --git a/c++/src/CpuInfoUtil.hh b/c++/src/CpuInfoUtil.hh index d2c2a8d7b1..656b3707bf 100644 --- a/c++/src/CpuInfoUtil.hh +++ b/c++/src/CpuInfoUtil.hh @@ -16,6 +16,13 @@ * limitations under the License. */ +/** + * @file CpuInfoUtil.hh code borrowing from + * https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/cpu_info.h + * @file CpuInfoUtil.cc code borrowing from + * https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/cpu_info.cc + */ + #ifndef ORC_CPUINFOUTIL_HH #define ORC_CPUINFOUTIL_HH @@ -51,9 +58,6 @@ namespace orc { static constexpr int64_t BMI1 = (1LL << 11); static constexpr int64_t BMI2 = (1LL << 12); - // Arm features - static constexpr int64_t ASIMD = (1LL << 32); - // Cache enums for L1 (data), L2 and L3 enum class CacheLevel { L1 = 0, L2, L3, Last = L3 }; From 0f1adda866e133b44c13ffecbf512dcccb93bd38 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Mar 2023 21:23:47 -0400 Subject: [PATCH 49/80] Add the cpu flags print on windows. 
--- cmake_modules/ConfigSimdLevel.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 0d6ad35eb4..85d1bbbe41 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -63,7 +63,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() - if(CXX_SUPPORTS_AVX512 AND NOT MSVC) + if(CXX_SUPPORTS_AVX512) execute_process(COMMAND grep flags /proc/cpuinfo COMMAND head -1 OUTPUT_VARIABLE flags_ver) From 21de59a6338b06e0ec0b0846e4191e547b394b88 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Mar 2023 09:19:52 -0400 Subject: [PATCH 50/80] 1. Code format change in c++/src/Bpacking.hh 2. Add header in c++/src/CpuInfoUtil.cc --- c++/src/Bpacking.hh | 2 +- c++/src/CpuInfoUtil.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 6bcd2c9565..85e3115d8e 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -27,7 +27,7 @@ namespace orc { class BitUnpack { public: static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index c7daa57c62..993836b9ff 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -39,6 +39,7 @@ #include #include #include +#include #include #include From ae0d5c238c7d7d76a96b6b3209bc8c9beafb11ae Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Mar 2023 10:29:19 -0400 Subject: [PATCH 51/80] Code format change about c++/src/CpuInfoUtil.cc --- c++/src/CpuInfoUtil.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index 993836b9ff..f70078bda6 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -38,8 +38,8 @@ #include #include #include -#include #include +#include #include 
#include From 070ca0fc4d594d862a6fd4c0ddbc7b4dd8a6b3ec Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 17 Mar 2023 13:02:55 +0800 Subject: [PATCH 52/80] Update cmake_modules/ConfigSimdLevel.cmake --- cmake_modules/ConfigSimdLevel.cmake | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 85d1bbbe41..5caf51e61c 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -1,14 +1,19 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
INCLUDE(CheckCXXCompilerFlag) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") From 1fdfe54485f848357806c3e9c6c0edc7d8bed826 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 20 Mar 2023 10:22:04 -0400 Subject: [PATCH 53/80] 1. Deleted some useless header files included in source file 2. Modified some functions names 3. Added check_cxx_compiler_flag in cmake file to check CPU has AVX512 flags or not. --- .github/workflows/build_and_test.yml | 4 ++-- c++/src/BitUnpackerAvx512.hh | 2 +- c++/src/Bpacking.hh | 2 -- c++/src/BpackingAvx512.cc | 4 ++-- c++/src/BpackingAvx512.hh | 4 ---- c++/src/BpackingDefault.hh | 5 ++--- c++/src/CMakeLists.txt | 6 +++--- c++/src/RleDecoderV2.cc | 1 - c++/test/CMakeLists.txt | 4 ++-- c++/test/TestRleVectorDecoder.cc | 4 ++-- cmake_modules/ConfigSimdLevel.cmake | 8 +++++--- 11 files changed, 19 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 019351fb72..8343931cb8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -102,7 +102,7 @@ jobs: cxx: - clang++ env: - ORC_USER_SIMD_LEVEL: avx512 + ORC_USER_SIMD_LEVEL: AVX512 steps: - name: Checkout uses: actions/checkout@v2 @@ -118,7 +118,7 @@ jobs: name: "SIMD programming using C++ intrinsic functions on Windows" runs-on: windows-2019 env: - ORC_USER_SIMD_LEVEL: avx512 + ORC_USER_SIMD_LEVEL: AVX512 steps: - name: Checkout uses: actions/checkout@v2 diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 67f362a2f7..63017edaef 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -461,7 +461,7 @@ namespace orc { 0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B, 0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B}; - uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) { + inline uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t 
bitsize) { uint32_t remnant = bitsize - start_bit; uint32_t ret_value = 0xFFFFFFFF; for (uint32_t i = 0u; i < bitsize; ++i) { diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 85e3115d8e..41cb595e7f 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -21,8 +21,6 @@ #include -#include "RLEv2.hh" - namespace orc { class BitUnpack { public: diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index fef458b396..74e72795f7 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -16,9 +16,9 @@ * limitations under the License. */ -#include "BpackingAvx512.hh" #include "BitUnpackerAvx512.hh" -#include "Utils.hh" +#include "BpackingAvx512.hh" +#include "CpuInfoUtil.hh" namespace orc { UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) { diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 9f45167f91..ed366fab1a 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -23,10 +23,6 @@ #include #include "BpackingDefault.hh" -#include "Dispatch.hh" -#include "RLEv2.hh" -#include "io/InputStream.hh" -#include "io/OutputStream.hh" namespace orc { diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index dc7fde264e..00e11169d6 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -22,10 +22,9 @@ #include #include -#include "Bpacking.hh" #include "RLEv2.hh" -#include "io/InputStream.hh" -#include "io/OutputStream.hh" + +#include "Bpacking.hh" namespace orc { diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 7ee8751d8f..a2f7be3b62 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -161,12 +161,14 @@ set(SOURCE_FILES Adaptor.cc BlockBuffer.cc BloomFilter.cc + BpackingDefault.cc ByteRLE.cc ColumnPrinter.cc ColumnReader.cc ColumnWriter.cc Common.cc Compression.cc + CpuInfoUtil.cc Exceptions.cc Int128.cc LzoDecompressor.cc @@ -184,9 +186,7 @@ set(SOURCE_FILES Timezone.cc TypeImpl.cc 
Vector.cc - Writer.cc - CpuInfoUtil.cc - BpackingDefault.cc) + Writer.cc) if(BUILD_LIBHDFSPP) set(SOURCE_FILES ${SOURCE_FILES} OrcHdfsFile.cc) diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index f05ff5656a..b3a8349cf4 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -17,7 +17,6 @@ */ #include "Adaptor.hh" -// #include "Bpacking.hh" #include "BpackingDefault.hh" #if defined(ORC_HAVE_RUNTIME_AVX512) #include "BpackingAvx512.hh" diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index 6698e5d49a..31ecc96819 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -19,7 +19,7 @@ include_directories( set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") if(BUILD_ENABLE_AVX512) - set(SIMD_TEST TestRleVectorDecoder.cc) + set(SIMD_TEST_SRCS TestRleVectorDecoder.cc) endif(BUILD_ENABLE_AVX512) add_executable (orc-test @@ -54,7 +54,7 @@ add_executable (orc-test TestTimezone.cc TestType.cc TestWriter.cc - ${SIMD_TEST} + ${SIMD_TEST_SRCS} ) target_link_libraries (orc-test diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc index 7b99c09548..352b883485 100644 --- a/c++/test/TestRleVectorDecoder.cc +++ b/c++/test/TestRleVectorDecoder.cc @@ -74,7 +74,7 @@ namespace orc { alignBitpacking = GetParam(); } - void generateDataFolBits(uint64_t numValues, int64_t start, int64_t delta, bool random, + void generateDataForBits(uint64_t numValues, int64_t start, int64_t delta, bool random, int64_t* data, uint8_t bitWidth, uint64_t numNulls = 0, char* notNull = nullptr) { int64_t max = pow(2, bitWidth); @@ -141,7 +141,7 @@ namespace orc { char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; int64_t* data = new int64_t[numValues]; - generateDataFolBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); + generateDataForBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); encoder->add(data, numValues, notNull); encoder->flush(); diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 5caf51e61c..71974e1969 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -37,11 +37,13 @@ if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") + check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORT_AVX512) else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. set(ORC_AVX512_FLAG "-march=native -mtune=native") + check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) endif() if(MINGW) @@ -60,8 +62,8 @@ if(ORC_CPU_FLAG STREQUAL "x86") int main() { __m512i mask = _mm512_set1_epi32(0x1); - char out[32]; - _mm512_storeu_si512(out, mask); + char out[32]; + _mm512_storeu_si512(out, mask); return 0; }" CXX_SUPPORTS_AVX512) @@ -76,7 +78,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") endif() # Runtime SIMD level it can get from compiler - if(CXX_SUPPORTS_AVX512) + if(CXX_SUPPORTS_AVX512 AND COMPILER_SUPPORT_AVX512) message(STATUS "Enabled the AVX512 for RLE bit-unpacking") set(ORC_HAVE_RUNTIME_AVX512 ON) set(ORC_SIMD_LEVEL "AVX512") From 3f156b4e952fac47ea15abe3236b031886c9fee5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 20 Mar 2023 13:56:55 -0400 Subject: [PATCH 54/80] 1. Code format about c++/src/BpackingAvx512.cc 2. 
Changed the compiler flags judgement on windows --- c++/src/BpackingAvx512.cc | 2 +- cmake_modules/ConfigSimdLevel.cmake | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 74e72795f7..b30ccc5701 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -16,8 +16,8 @@ * limitations under the License. */ -#include "BitUnpackerAvx512.hh" #include "BpackingAvx512.hh" +#include "BitUnpackerAvx512.hh" #include "CpuInfoUtil.hh" namespace orc { diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 71974e1969..bc7e6465c1 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -37,14 +37,13 @@ if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") - check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORT_AVX512) else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. set(ORC_AVX512_FLAG "-march=native -mtune=native") - check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) endif() + check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 From 4b166eecb1c97e2eb3ccb869c8b66fc101860fe5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 22 Mar 2023 14:53:01 -0400 Subject: [PATCH 55/80] 1. Delete the redundant buffer array in class UnpackAvx512 2. Add the forward declaration of RleDecoderV2 3. 
Modified the comments in CpuInfo file --- c++/src/Bpacking.hh | 2 + c++/src/BpackingAvx512.cc | 353 +++++++++++++++++++------------------ c++/src/BpackingAvx512.hh | 18 +- c++/src/BpackingDefault.cc | 1 + c++/src/BpackingDefault.hh | 5 +- c++/src/CpuInfoUtil.cc | 4 + c++/src/CpuInfoUtil.hh | 5 +- 7 files changed, 203 insertions(+), 185 deletions(-) diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 41cb595e7f..898376b019 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -22,6 +22,8 @@ #include namespace orc { + class RleDecoderV2; + class BitUnpack { public: static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index b30ccc5701..4c0f3ec7aa 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -19,6 +19,7 @@ #include "BpackingAvx512.hh" #include "BitUnpackerAvx512.hh" #include "CpuInfoUtil.hh" +#include "RLEv2.hh" namespace orc { UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) { @@ -91,25 +92,27 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __m512i reverseMask1u = _mm512_loadu_si512(reverseMaskTable1u); - while (numElements >= 64) { + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint64_t src_64 = *reinterpret_cast(const_cast(srcPtr)); // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF __m512i srcmm = _mm512_movm_epi8(src_64); // make 0x00 --> 0x00, 0xFF --> 0x01 srcmm = _mm512_abs_epi8(srcmm); srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); - _mm512_storeu_si512(vectorBuf8, srcmm); + _mm512_storeu_si512(simdPtr, srcmm); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -212,10 +215,11 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); __m512i srcmm0, srcmm1, srcmm2, tmpmm; @@ -244,16 +248,16 @@ namespace orc { srcmm0 = _mm512_and_si512(srcmm0, parse_mask); - _mm512_storeu_si512(vectorBuf8, srcmm0); + _mm512_storeu_si512(simdPtr, srcmm0); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -356,7 +360,8 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = 
reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -370,7 +375,7 @@ namespace orc { shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable3u_0); shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable3u_1); - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); @@ -388,16 +393,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask); - _mm512_storeu_si512(vectorBuf8, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -500,10 +505,11 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm0, srcmm1, tmpmm; srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); @@ -520,16 +526,16 @@ namespace orc { // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
srcmm0 = _mm512_and_si512(srcmm0, parseMask); - _mm512_storeu_si512(vectorBuf8, srcmm0); + _mm512_storeu_si512(simdPtr, srcmm0); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -632,7 +638,8 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -646,7 +653,7 @@ namespace orc { shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable5u_0); shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable5u_1); - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); @@ -664,16 +671,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask); - _mm512_storeu_si512(vectorBuf8, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -776,7 +783,8 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= 
VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -790,7 +798,7 @@ namespace orc { shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable6u_0); shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable6u_1); - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); @@ -808,16 +816,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask); - _mm512_storeu_si512(vectorBuf8, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -920,7 +928,8 @@ namespace orc { } } - if (numElements >= 64) { + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -934,7 +943,7 @@ namespace orc { shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable7u_0); shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable7u_1); - while (numElements >= 64) { + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); @@ -952,16 +961,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask); - 
_mm512_storeu_si512(vectorBuf8, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 8 * bitWidth; - numElements -= 64; - std::copy(vectorBuf8, vectorBuf8 + 64, dstPtr); - dstPtr += 64; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; } } @@ -1065,6 +1074,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -1095,16 +1105,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } if (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -1146,16 +1156,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - 
decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -1259,6 +1269,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1278,16 +1289,16 @@ namespace orc { zmm = _mm512_srlv_epi16(zmm, shiftMask); zmm = _mm512_and_si512(zmm, parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm); + _mm512_storeu_si512(simdPtr, zmm); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -1391,6 +1402,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -1430,16 +1442,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * 
bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } if (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -1481,16 +1493,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -1594,6 +1606,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1613,7 +1626,7 @@ namespace orc { zmm = _mm512_srlv_epi16(zmm, shiftMask); zmm = _mm512_and_si512(zmm, parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm); + _mm512_storeu_si512(simdPtr, zmm); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, @@ -1621,7 +1634,7 @@ namespace orc { bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); + std::copy(simdPtr, simdPtr + 32, dstPtr); dstPtr += 32; } } @@ -1726,6 +1739,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -1765,16 +1779,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } if (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -1816,16 +1830,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -1929,6 +1943,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1960,16 +1975,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); 
zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -2073,6 +2088,7 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -2112,16 +2128,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } if (numElements >= 32) { __m512i srcmm, zmm[2]; @@ -2163,16 +2179,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); - _mm512_storeu_si512(vectorBuf16, zmm[0]); + _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * 
bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -2243,20 +2259,21 @@ namespace orc { } if (numElements >= 32) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); while (numElements >= 32) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); - _mm512_storeu_si512(vectorBuf16, srcmm); + _mm512_storeu_si512(simdPtr, srcmm); srcPtr += 4 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(vectorBuf16, vectorBuf16 + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -2386,16 +2403,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -2437,16 +2454,16 @@ namespace orc { zmm[0] = 
_mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -2580,16 +2597,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -2631,16 +2648,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + 
VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -2774,16 +2791,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -2825,16 +2842,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -2957,16 +2974,16 @@ namespace orc { zmm = _mm512_srlv_epi32(zmm, shiftMask); zmm = _mm512_and_si512(zmm, parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm); + _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + 
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3100,16 +3117,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -3151,16 +3168,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3294,16 +3311,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * 
bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -3345,16 +3362,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3489,16 +3506,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -3540,16 +3557,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3633,16 +3650,16 @@ namespace orc { zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); - _mm512_storeu_si512(vectorBuf32, zmm); + _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3772,16 +3789,16 @@ namespace orc { zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { @@ -3823,16 +3840,16 @@ namespace orc { zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - 
_mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -3955,16 +3972,16 @@ namespace orc { zmm = _mm512_srlv_epi32(zmm, shiftMask); zmm = _mm512_and_si512(zmm, parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm); + _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -4107,16 +4124,16 @@ namespace orc { zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); zmm[0] = _mm512_and_si512(zmm[0], parseMask0); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } if (numElements >= 16) { __m512i srcmm, zmm[2]; @@ -4157,16 +4174,16 @@ namespace orc { zmm[0] 
= _mm512_or_si512(lowNibblemm, highNibblemm); zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - _mm512_storeu_si512(vectorBuf32, zmm[0]); + _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } @@ -4241,16 +4258,16 @@ namespace orc { while (numElements >= 16) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); - _mm512_storeu_si512(vectorBuf32, srcmm); + _mm512_storeu_si512(vectorBuf, srcmm); srcPtr += 2 * bitWidth; decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 2 * bitWidth; - numElements -= 16; - std::copy(vectorBuf32, vectorBuf32 + 16, dstPtr); - dstPtr += 16; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } } diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index ed366fab1a..3d5b05aedc 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -19,16 +19,18 @@ #ifndef ORC_BPACKINGAVX512_HH #define ORC_BPACKINGAVX512_HH -#include #include +#include #include "BpackingDefault.hh" namespace orc { -#define MAX_VECTOR_BUF_8BIT_LENGTH 64 -#define MAX_VECTOR_BUF_16BIT_LENGTH 32 -#define MAX_VECTOR_BUF_32BIT_LENGTH 16 +#define VECTOR_UNPACK_8BIT_MAX_NUM 64 +#define VECTOR_UNPACK_16BIT_MAX_NUM 32 +#define VECTOR_UNPACK_32BIT_MAX_NUM 16 + + class RleDecoderV2; class UnpackAvx512 { public: @@ -70,12 +72,8 @@ 
namespace orc { RleDecoderV2* decoder; UnpackDefault unpackDefault; - // Used by vectorially 1~8 bit-unpacking data - uint8_t vectorBuf8[MAX_VECTOR_BUF_8BIT_LENGTH + 1]; - // Used by vectorially 9~16 bit-unpacking data - uint16_t vectorBuf16[MAX_VECTOR_BUF_16BIT_LENGTH + 1]; - // Used by vectorially 17~32 bit-unpacking data - uint32_t vectorBuf32[MAX_VECTOR_BUF_32BIT_LENGTH + 1]; + // Used by vectorially bit-unpacking data + uint32_t vectorBuf[VECTOR_UNPACK_32BIT_MAX_NUM + 1]; }; class BitUnpackAVX512 : public BitUnpack { diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 8a587a52ff..b937be77f0 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -17,6 +17,7 @@ */ #include "BpackingDefault.hh" +#include "RLEv2.hh" #include "Utils.hh" namespace orc { diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index 00e11169d6..f68812eda5 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -19,14 +19,13 @@ #ifndef ORC_BPACKINGDEFAULT_HH #define ORC_BPACKINGDEFAULT_HH -#include #include - -#include "RLEv2.hh" +#include #include "Bpacking.hh" namespace orc { + class RleDecoderV2; class UnpackDefault { public: diff --git a/c++/src/CpuInfoUtil.cc b/c++/src/CpuInfoUtil.cc index f70078bda6..bf32617c43 100644 --- a/c++/src/CpuInfoUtil.cc +++ b/c++/src/CpuInfoUtil.cc @@ -16,6 +16,10 @@ * limitations under the License. 
*/ +/** + * @file CpuInfoUtil.cc is from Apache Arrow as of 2023-03-21 + */ + #include "CpuInfoUtil.hh" #ifdef __APPLE__ diff --git a/c++/src/CpuInfoUtil.hh b/c++/src/CpuInfoUtil.hh index 656b3707bf..ad7df6a82e 100644 --- a/c++/src/CpuInfoUtil.hh +++ b/c++/src/CpuInfoUtil.hh @@ -17,10 +17,7 @@ */ /** - * @file CpuInfoUtil.hh code borrowing from - * https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/cpu_info.h - * @file CpuInfoUtil.cc code borrowing from - * https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/cpu_info.cc + * @file CpuInfoUtil.hh is from Apache Arrow as of 2023-03-21 */ #ifndef ORC_CPUINFOUTIL_HH From 3c21f2e178aff90c953e7a455ab716b82077417f Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Wed, 22 Mar 2023 15:18:36 +0800 Subject: [PATCH 56/80] Use macros to replace some number --- c++/src/BpackingAvx512.cc | 114 +++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 4c0f3ec7aa..c1e372d8ad 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -1073,7 +1073,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1094,7 +1094,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable9u); - while (numElements >= 64) { + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -1116,7 +1116,7 @@ namespace orc { std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, 
srcPtr); @@ -1268,7 +1268,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1277,7 +1277,7 @@ namespace orc { __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable10u); __m512i shiftMask = _mm512_loadu_si512(shiftTable10u); - while (numElements >= 32) { + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -1401,7 +1401,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1425,7 +1425,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable11u); - while (numElements >= 64) { + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -1453,7 +1453,7 @@ namespace orc { std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -1605,7 +1605,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1614,7 +1614,7 @@ namespace orc { __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable12u); __m512i shiftMask = 
_mm512_loadu_si512(shiftTable12u); - while (numElements >= 32) { + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -1633,9 +1633,9 @@ namespace orc { 0); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; - numElements -= 32; - std::copy(simdPtr, simdPtr + 32, dstPtr); - dstPtr += 32; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } } @@ -1738,7 +1738,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1762,7 +1762,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable13u); - while (numElements >= 64) { + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -1790,7 +1790,7 @@ namespace orc { std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -1942,7 +1942,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -1957,7 +1957,7 @@ namespace orc { shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable14u_0); shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable14u_1); - while (numElements >= 32) { + while (numElements >= 
VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -2087,7 +2087,7 @@ namespace orc { } } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -2111,7 +2111,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable15u); - while (numElements >= 64) { + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -2139,7 +2139,7 @@ namespace orc { std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -2258,10 +2258,10 @@ namespace orc { tailBitLen = 0; } - if (numElements >= 32) { + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); - while (numElements >= 32) { + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); _mm512_storeu_si512(simdPtr, srcmm); @@ -2372,7 +2372,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -2392,7 +2392,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable17u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = 
_mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); @@ -2415,7 +2415,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -2566,7 +2566,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -2586,7 +2586,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable18u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -2609,7 +2609,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -2760,7 +2760,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -2780,7 +2780,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable19u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -2803,7 +2803,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -2938,7 +2938,7 @@ namespace orc { tailBitLen = 0; } - if 
(startBit > 0u) { + if (startBit > 0) { uint32_t align = getAlign(startBit, bitWidth, 32u); if (align > numElements) { align = numElements; @@ -2954,7 +2954,7 @@ namespace orc { } } - if (numElements >= 16u) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -2962,7 +2962,7 @@ namespace orc { __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable20u); __m512i shiftMask = _mm512_loadu_si512(shiftTable20u); - while (numElements >= 16u) { + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -3070,7 +3070,7 @@ namespace orc { tailBitLen = 0; } - if (startBit > 0u) { + if (startBit > 0) { uint32_t align = getAlign(startBit, bitWidth, 32); if (align > numElements) { align = numElements; @@ -3086,7 +3086,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -3106,7 +3106,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable21u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -3129,7 +3129,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -3280,7 +3280,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = 
_mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -3300,7 +3300,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable22u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -3323,7 +3323,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -3475,7 +3475,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -3495,7 +3495,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable23u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -3518,7 +3518,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); @@ -3636,13 +3636,13 @@ namespace orc { tailBitLen = 0; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i shuffleIdx = _mm512_loadu_si512(shuffleIdxTable24u_0); __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable24u); - while (numElements >= 16) { + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -3758,7 +3758,7 @@ namespace orc { } } - if (numElements >= 16) { + 
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -3778,7 +3778,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable26u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); @@ -3801,7 +3801,7 @@ namespace orc { dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -3952,7 +3952,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); @@ -3960,7 +3960,7 @@ namespace orc { __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable28u); __m512i shiftMask = _mm512_loadu_si512(shiftTable28u); - while (numElements >= 16) { + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -4084,7 +4084,7 @@ namespace orc { } } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); @@ -4107,7 +4107,7 @@ namespace orc { __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable30u); - while (numElements >= 32) { + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); @@ -4135,7 +4135,7 @@ namespace orc 
{ std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm, zmm[2]; srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); @@ -4253,9 +4253,9 @@ namespace orc { tailBitLen = 0; } - if (numElements >= 16) { + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); - while (numElements >= 16) { + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i srcmm = _mm512_loadu_si512(srcPtr); srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); _mm512_storeu_si512(vectorBuf, srcmm); From 27d5b403fc8046f2be8eb0622b5c9842a7605050 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 24 Mar 2023 16:15:32 -0400 Subject: [PATCH 57/80] Change RleDecoderV2::readLongs return type back to void. --- c++/src/Bpacking.hh | 4 ++-- c++/src/BpackingAvx512.cc | 6 ++---- c++/src/BpackingAvx512.hh | 4 ++-- c++/src/BpackingDefault.cc | 5 ++--- c++/src/BpackingDefault.hh | 4 ++-- c++/src/RLEv2.hh | 2 +- c++/src/RleDecoderV2.cc | 2 +- 7 files changed, 12 insertions(+), 15 deletions(-) diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh index 898376b019..f55e986d8d 100644 --- a/c++/src/Bpacking.hh +++ b/c++/src/Bpacking.hh @@ -26,8 +26,8 @@ namespace orc { class BitUnpack { public: - static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index c1e372d8ad..1a00477a9e 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -4328,8 +4328,8 @@ namespace orc { } } - int BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, - uint64_t len, uint64_t fbs) { + void BitUnpackAVX512::readLongs(RleDecoderV2* 
decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { UnpackAvx512 unpackAvx512(decoder); UnpackDefault unpackDefault(decoder); uint64_t startBit = 0; @@ -4472,7 +4472,5 @@ namespace orc { break; } } - - return 0; } } // namespace orc diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 3d5b05aedc..8da18f934e 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -78,8 +78,8 @@ namespace orc { class BitUnpackAVX512 : public BitUnpack { public: - static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index b937be77f0..1248a06afb 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -327,8 +327,8 @@ namespace orc { } } - int BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, - uint64_t len, uint64_t fbs) { + void BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { UnpackDefault unpackDefault(decoder); switch (fbs) { case 4: @@ -363,7 +363,6 @@ namespace orc { unpackDefault.plainUnpackLongs(data, offset, len, fbs); break; } - return 0; } } // namespace orc diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh index f68812eda5..0a58234495 100644 --- a/c++/src/BpackingDefault.hh +++ b/c++/src/BpackingDefault.hh @@ -50,8 +50,8 @@ namespace orc { class BitUnpackDefault : public BitUnpack { public: - static int readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, - uint64_t fbs); + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); }; } // namespace orc diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index de7fb30a02..a8ec3accc0 100644 --- 
a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -206,7 +206,7 @@ namespace orc { int64_t readLongBE(uint64_t bsz); int64_t readVslong(); uint64_t readVulong(); - int readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); template uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index b3a8349cf4..f1eff52076 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -84,7 +84,7 @@ namespace orc { } }; - int RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { static DynamicDispatch dispatch; return dispatch.func(this, data, offset, len, fbs); } From 7cea68ee3a7d1096235b9566728c2eec28be65db Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Mar 2023 11:43:41 -0400 Subject: [PATCH 58/80] Added "how to build&use AVX512 in ORC" in README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index f5216af83c..dd80e7b513 100644 --- a/README.md +++ b/README.md @@ -93,3 +93,15 @@ To build only the C++ library: % make test-out ``` + +To build the C++ library with AVX512 enabling: +```shell +ENV parameter ORC_USER_SIMD_LEVEL is to switch "AVX512" and "NONE" at the running time. +export ORC_USER_SIMD_LEVEL=AVX512 +% mkdir build +% cd build +% cmake .. -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON +% make package +% make test-out + +``` From 3be42ee644aa891a8bdc92c7bc67cdcb54fde01c Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Mar 2023 15:32:54 -0400 Subject: [PATCH 59/80] 1.Modified the description about how to use AVX512 in README.md 2.Change the status of "AVX512 required but compiler doesn't support it" from fatal_error to warning. 
--- README.md | 7 ++++--- cmake_modules/ConfigSimdLevel.cmake | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dd80e7b513..194c49511d 100644 --- a/README.md +++ b/README.md @@ -94,14 +94,15 @@ To build only the C++ library: ``` -To build the C++ library with AVX512 enabling: +To build the C++ library with AVX512 enabled: ```shell -ENV parameter ORC_USER_SIMD_LEVEL is to switch "AVX512" and "NONE" at the running time. +Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. +Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. +Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. export ORC_USER_SIMD_LEVEL=AVX512 % mkdir build % cd build % cmake .. 
-DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON % make package % make test-out - ``` diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index bc7e6465c1..3f10794a9b 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -83,7 +83,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) else() - message(FATAL_ERROR "AVX512 required but compiler doesn't support it, failed to enable AVX512.") + message(STATUS "WARNING: AVX512 required but compiler doesn't support it, failed to enable AVX512.") set(ORC_HAVE_RUNTIME_AVX512 OFF) endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") From 11ceeaa20ff793521d0b40514ccb074e82edbcb3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 27 Mar 2023 19:03:54 -0400 Subject: [PATCH 60/80] When compiler doesn't support AVX512, but customer set BUILD_ENABLE_AVX512=on, it will disable AVX512 compile, and reset BUILD_ENABLE_AVX512=off. --- cmake_modules/ConfigSimdLevel.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 3f10794a9b..411c5d47b4 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -79,12 +79,11 @@ if(ORC_CPU_FLAG STREQUAL "x86") # Runtime SIMD level it can get from compiler if(CXX_SUPPORTS_AVX512 AND COMPILER_SUPPORT_AVX512) message(STATUS "Enabled the AVX512 for RLE bit-unpacking") - set(ORC_HAVE_RUNTIME_AVX512 ON) set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) else() message(STATUS "WARNING: AVX512 required but compiler doesn't support it, failed to enable AVX512.") - set(ORC_HAVE_RUNTIME_AVX512 OFF) + set(BUILD_ENABLE_AVX512 OFF) endif() if(ORC_SIMD_LEVEL STREQUAL "DEFAULT") set(ORC_SIMD_LEVEL "NONE") From 277d9beec58be8dc9cdefd75d8cbb45d57cbd5a5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 28 Mar 2023 11:31:24 -0400 Subject: [PATCH 
61/80] 1. Update link information about apple avx512 in CMakeLists.txt 2. Modified the CI test about avx512 on windows 3. Modified the COMPILER_SUPPORT_AVX512 on windows --- .github/workflows/build_and_test.yml | 36 +++++++++++----------------- CMakeLists.txt | 2 +- cmake_modules/ConfigSimdLevel.cmake | 7 ++++-- 3 files changed, 20 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8343931cb8..133031919d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -74,8 +74,16 @@ jobs: cat /home/runner/work/orc/orc/build/java/rat.txt windows: - name: "Build on Windows" + name: "Build and ${{ matrix.simd }} Test on Windows" runs-on: windows-2019 + strategy: + fail-fast: false + matrix: + simd: + - General + - AVX512 + env: + ORC_USER_SIMD_LEVEL: AVX512 steps: - name: Checkout uses: actions/checkout@v2 @@ -87,7 +95,11 @@ jobs: run: | mkdir build cd build - cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF + if [ "${{ matrix.simd }}" = "General" ]; then + cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF + else + cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON + fi cmake --build . --config Debug ctest -C Debug --output-on-failure @@ -114,26 +126,6 @@ jobs: cmake -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON .. make package test-out - simdWindows: - name: "SIMD programming using C++ intrinsic functions on Windows" - runs-on: windows-2019 - env: - ORC_USER_SIMD_LEVEL: AVX512 - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.1 - with: - msbuild-architecture: x64 - - name: "Test" - run: | - mkdir build - cd build - cmake .. 
-G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON - cmake --build . --config Debug - ctest -C Debug --output-on-failure - doc: name: "Javadoc generation" runs-on: ubuntu-20.04 diff --git a/CMakeLists.txt b/CMakeLists.txt index 69a6af4ab6..e6dbaf2d5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,7 +178,7 @@ message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") # macOS doesn't fully support AVX512, it has a different way dealing with AVX512 than Windows and Linux. # # Here can find the description: -# https://github.com/apple/darwin-xnu/blob/0a798f6738bc1db01281fc08ae024145e84df927/osfmk/i386/fpu.c#L176 +# https://github.com/apple/darwin-xnu/blob/2ff845c2e033bd0ff64b5b6aa6063a1f8f65aa32/osfmk/i386/fpu.c#L174 if (BUILD_ENABLE_AVX512 AND NOT APPLE) INCLUDE(ConfigSimdLevel) endif () diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 411c5d47b4..9a82d82c9d 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -37,13 +37,14 @@ if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") + check_cxx_compiler_flag(${ORC_AVX512_FLAG} COMPILER_SUPPORT_AVX512) else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. 
set(ORC_AVX512_FLAG "-march=native -mtune=native") + check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) endif() - check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 @@ -91,7 +92,9 @@ if(ORC_CPU_FLAG STREQUAL "x86") if(ORC_SIMD_LEVEL STREQUAL "AVX512") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}") + message(STATUS "ORC_HAVE_RUNTIME_AVX512 defined, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}") + else() + message(STATUS "ORC_HAVE_RUNTIME_AVX512 not defined, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}") endif() endif() -message(STATUS "ORC_HAVE_RUNTIME_AVX512: ${ORC_HAVE_RUNTIME_AVX512}, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}") From 305a3179b0bcec50670315c8f1a6ae0f7ce3e54a Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 28 Mar 2023 14:22:06 -0400 Subject: [PATCH 62/80] Fix an error about if judgement in windows CI test --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 133031919d..42e2feffd3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -74,7 +74,7 @@ jobs: cat /home/runner/work/orc/orc/build/java/rat.txt windows: - name: "Build and ${{ matrix.simd }} Test on Windows" + name: "C++ ${{ matrix.simd }} Test on Windows" runs-on: windows-2019 strategy: fail-fast: false @@ -95,7 +95,7 @@ jobs: run: | mkdir build cd build - if [ "${{ matrix.simd }}" = "General" ]; then + if ([ "${{ matrix.simd }}" = "General" ]); then cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF else cmake .. 
-G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON From 4debd508c2778221cda14d0546799f21c377ec03 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 29 Mar 2023 11:05:47 -0400 Subject: [PATCH 63/80] Add the align header and tailer code in the process of bit-unpacking. --- c++/src/BitUnpackerAvx512.hh | 16 +- c++/src/BpackingAvx512.cc | 2248 ++++------------------------------ c++/src/BpackingAvx512.hh | 14 + c++/src/RLEv2.hh | 4 +- 4 files changed, 272 insertions(+), 2010 deletions(-) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 63017edaef..33f788f430 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -461,17 +461,17 @@ namespace orc { 0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B, 0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B}; - inline uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) { - uint32_t remnant = bitsize - start_bit; - uint32_t ret_value = 0xFFFFFFFF; - for (uint32_t i = 0u; i < bitsize; ++i) { - uint32_t test_value = (i * base) % bitsize; - if (test_value == remnant) { - ret_value = i; + inline uint32_t getAlign(uint32_t startBit, uint32_t base, uint32_t bitSize) { + uint32_t remnant = bitSize - startBit; + uint32_t retValue = 0xFFFFFFFF; + for (uint32_t i = 0u; i < bitSize; ++i) { + uint32_t testValue = (i * base) % bitSize; + if (testValue == remnant) { + retValue = i; break; } } - return ret_value; + return retValue; } inline uint64_t moveLen(uint64_t x, uint64_t y) { diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 1a00477a9e..d9ca3ba87e 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -30,10 +30,106 @@ namespace orc { // PASS } + inline void UnpackAvx512::alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& len, + 
uint32_t& bitWidth, uint64_t& tailBitLen, + uint32_t& backupByteLen, uint64_t& numElements, + bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr, uint32_t bitMaxSize) { + if (startBit != 0) { + bufMoveByteLen += + moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = len; + resetBuf = false; + len -= numElements; + } else { + if (startBit != 0) { + numElements = + (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, + bitWidth); + resetBuf = true; + } else { + numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; + len -= numElements; + tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); + resetBuf = true; + } + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; + tailBitLen = 0; + } + + if (startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, bitMaxSize); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= + moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + dstPtr += align; + numElements -= align; + } + } + } + + inline void UnpackAvx512::alignTailerBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& len, + uint32_t& bitWidth, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr) { + if (numElements > 0) { + if (startBit != 0) { + bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, + 
ORC_VECTOR_BYTE_WIDTH); + } else { + bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + } + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->bufferStart); + dstPtr += numElements; + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, + resetBuf, backupByteLen); + return; + } + + if (backupByteLen != 0) { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + dstPtr++; + backupByteLen = 0; + len--; + } else { + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); + } + + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->bufferStart); + } + void UnpackAvx512::vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 1; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); - uint32_t numElements = 0; + uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; @@ -43,54 +139,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + 
ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -116,47 +167,15 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - 
dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } void UnpackAvx512::vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 2; const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); - uint32_t numElements = 0; + uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; @@ -166,54 +185,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + 
startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -261,47 +235,15 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } void UnpackAvx512::vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 3; const uint8_t* srcPtr 
= reinterpret_cast(decoder->bufferStart); - uint32_t numElements = 0; + uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; @@ -311,54 +253,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -406,40 +303,8 @@ 
namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -456,54 +321,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } 
else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -539,40 +359,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -589,54 +377,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, 
resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -684,40 +427,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -734,54 +445,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) 
/ - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -829,40 +495,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 
bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -879,54 +513,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 8); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - 
numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_8Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -974,40 +563,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1024,54 +581,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = 
false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1169,40 +681,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 
bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1219,54 +699,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, 
align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1287,55 +722,23 @@ namespace orc { // shifting elements so they start from the start of the word zmm = _mm512_srlv_epi16(zmm, shiftMask); - zmm = _mm512_and_si512(zmm, parseMask0); - - _mm512_storeu_si512(simdPtr, zmm); - - srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen -= 4 * bitWidth; - numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; - std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); - dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(simdPtr, zmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1352,54 +755,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align 
!= 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1506,40 +864,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1556,54 +882,9 @@ namespace orc { uint32_t backupByteLen = 0; 
while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1639,40 +920,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - 
plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1689,54 +938,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / 
ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1843,40 +1047,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + 
alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -1893,54 +1065,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1988,40 +1115,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen 
-= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -2038,54 +1133,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len 
-= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 16); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_16Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -2187,45 +1237,13 @@ namespace orc { bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; - std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); - dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, 
&decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -2323,54 +1341,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - 
srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -2467,40 +1440,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -2517,54 +1458,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - 
bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2661,40 +1557,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - 
bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -2711,54 +1575,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align 
= numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -2855,40 +1674,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -2905,54 +1692,9 @@ 
namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32u); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2987,40 +1729,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= 
moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -3037,54 +1747,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if 
(tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -3166,55 +1831,23 @@ namespace orc { lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); - zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); - - _mm512_storeu_si512(vectorBuf, zmm[0]); - - srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen -= 2 * bitWidth; - numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; - std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); - dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; - } - } - - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, + 0); + bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -3231,54 +1864,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen 
* ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -3375,40 +1963,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -3426,54 +1982,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, 
resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -3570,40 +2081,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -3709,54 +2188,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - 
startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -3853,40 +2287,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -3903,54 +2305,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = 
decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -3985,40 +2342,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } @@ -4035,54 +2360,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - if (startBit != 0) { - bufMoveByteLen += - moveLen(len * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveLen(len * bitWidth, 
ORC_VECTOR_BYTE_WIDTH); - } - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - len -= numElements; - } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / - bitWidth; - len -= numElements; - tailBitLen = fmod( - bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } - - if (startBit > 0) { - uint32_t align = getAlign(startBit, bitWidth, 32); - if (align > numElements) { - align = numElements; - } - if (align != 0) { - bufMoveByteLen -= - (align * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit) / ORC_VECTOR_BYTE_WIDTH; - plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - dstPtr += align; - numElements -= align; - } - } + alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr, + UNPACK_32Bit_MAX_SIZE); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -4187,40 +2467,8 @@ namespace orc { } } - if (numElements > 0) { - if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth - ORC_VECTOR_BYTE_WIDTH + startBit, - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); - } - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - 
decoder->bufferStart; - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); - return; - } - - if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); - dstPtr++; - backupByteLen = 0; - len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - } - - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + numElements, resetBuf, srcPtr, dstPtr); } } diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 8da18f934e..2ada55cd28 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -29,6 +29,9 @@ namespace orc { #define VECTOR_UNPACK_8BIT_MAX_NUM 64 #define VECTOR_UNPACK_16BIT_MAX_NUM 32 #define VECTOR_UNPACK_32BIT_MAX_NUM 16 +#define UNPACK_8Bit_MAX_SIZE 8 +#define UNPACK_16Bit_MAX_SIZE 16 +#define UNPACK_32Bit_MAX_SIZE 32 class RleDecoderV2; @@ -68,6 +71,17 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); + inline void alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& len, uint32_t& bitWidth, + uint64_t& tailBitLen, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr, uint32_t bitMaxSize); + + inline void alignTailerBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& len, uint32_t& bitWidth, + uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr); + private: 
RleDecoderV2* decoder; UnpackDefault unpackDefault; diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index a8ec3accc0..82a0954c13 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -228,8 +228,8 @@ namespace orc { DataBuffer literals; // Values of the current run }; - void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, - uint32_t backupByteLen) { + inline void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, + bool resetBuf, uint32_t backupByteLen) { uint64_t remainingLen = *bufEnd - *bufStart; int bufferLength = 0; const void* bufferPointer = nullptr; From 62d373cb2795872cb700c68e165c60e740b2c6f0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 29 Mar 2023 11:17:11 -0400 Subject: [PATCH 64/80] Fix an error in the CI test yaml file on windows platform. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 42e2feffd3..75fb39fa78 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -95,7 +95,7 @@ jobs: run: | mkdir build cd build - if ([ "${{ matrix.simd }}" = "General" ]); then + if ([[ "${{ matrix.simd }}" = "General" ]]); then cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF else cmake .. 
-G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON From 3dca1d74c4c0b5940abcdb465ffc24fbaf5dd87b Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 29 Mar 2023 11:35:51 -0400 Subject: [PATCH 65/80] Modified the AVX512 enable description in the README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 194c49511d..a32062828d 100644 --- a/README.md +++ b/README.md @@ -96,9 +96,6 @@ To build only the C++ library: To build the C++ library with AVX512 enabled: ```shell -Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. -Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. -Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. export ORC_USER_SIMD_LEVEL=AVX512 % mkdir build % cd build @@ -106,3 +103,8 @@ export ORC_USER_SIMD_LEVEL=AVX512 % make package % make test-out ``` +Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. + +Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. + +Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. 
From e23ca295337970718e024d4f468c5397e864b3af Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 31 Mar 2023 16:12:58 -0400 Subject: [PATCH 66/80] Add "shell: bash" in the CI test on windows, and make CI commands run within bash. --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 75fb39fa78..fbbaf99d9e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,10 +92,11 @@ jobs: with: msbuild-architecture: x64 - name: "Test" + shell: bash run: | mkdir build cd build - if ([[ "${{ matrix.simd }}" = "General" ]]); then + if [ "${{ matrix.simd }}" = "General" ]; then cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF else cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON From 1a3221222dc9c44d65562754646c6a5892e80bdb Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 11 Apr 2023 15:35:54 -0400 Subject: [PATCH 67/80] 1. In function alignHeaderBoundary and alignTailerBoundary, rename parameter "len" to a meaningful name 2. Modified the parameters order in alignHeaderBoundary and alignTailerBoundary 3. Modified function moveLen 4. Optimized 
some code in function alignHeaderBoundary --- c++/src/BitUnpackerAvx512.hh | 10 +- c++/src/BpackingAvx512.cc | 242 +++++++++++++++-------------------- c++/src/BpackingAvx512.hh | 16 ++- 3 files changed, 118 insertions(+), 150 deletions(-) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index 33f788f430..ad9ef74261 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -474,13 +474,9 @@ namespace orc { return retValue; } - inline uint64_t moveLen(uint64_t x, uint64_t y) { - uint64_t result = 0; - if (x % y == 0) { - result = x / y; - } else { - result = x / y + 1; - } + inline uint64_t moveByteLen(uint64_t numBits) { + uint64_t result = numBits / ORC_VECTOR_BYTE_WIDTH; + if (numBits % ORC_VECTOR_BYTE_WIDTH != 0) ++result; return result; } } // namespace orc diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index d9ca3ba87e..f65a7abfde 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -30,37 +30,32 @@ namespace orc { // PASS } - inline void UnpackAvx512::alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, - uint64_t& bufRestByteLen, uint64_t& len, - uint32_t& bitWidth, uint64_t& tailBitLen, - uint32_t& backupByteLen, uint64_t& numElements, - bool& resetBuf, const uint8_t*& srcPtr, - int64_t*& dstPtr, uint32_t bitMaxSize) { + inline void UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, uint32_t bitMaxSize, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, + uint64_t& remainingNumElements, + uint64_t& tailBitLen, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr) { if (startBit != 0) { bufMoveByteLen += - moveLen(len * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + moveByteLen(remainingNumElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); } else { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += 
moveByteLen(remainingNumElements * bitWidth); } if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; + numElements = remainingNumElements; resetBuf = false; - len -= numElements; + remainingNumElements = 0; } else { - if (startBit != 0) { - numElements = - (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + ORC_VECTOR_BYTE_WIDTH - startBit, - bitWidth); - resetBuf = true; - } else { - numElements = (bufRestByteLen * ORC_VECTOR_BYTE_WIDTH) / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } + uint64_t leadingBits = 0; + if (startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit; + uint64_t bufRestBitLen = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + leadingBits; + numElements = bufRestBitLen / bitWidth; + remainingNumElements -= numElements; + tailBitLen = fmod(bufRestBitLen, bitWidth); + resetBuf = true; } if (tailBitLen != 0) { @@ -74,8 +69,7 @@ namespace orc { align = numElements; } if (align != 0) { - bufMoveByteLen -= - moveLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); srcPtr = reinterpret_cast(decoder->bufferStart); bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; @@ -85,17 +79,17 @@ namespace orc { } } - inline void UnpackAvx512::alignTailerBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, - uint64_t& bufRestByteLen, uint64_t& len, - uint32_t& bitWidth, uint32_t& backupByteLen, - uint64_t& numElements, bool& resetBuf, - const uint8_t*& srcPtr, int64_t*& dstPtr) { + inline void UnpackAvx512::alignTailerBoundary(const uint32_t bitWidth, uint64_t& startBit, + uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, + uint64_t& remainingNumElements, + uint32_t& backupByteLen, 
uint64_t& numElements, + bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr) { if (numElements > 0) { if (startBit != 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH, - ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); } else { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(numElements * bitWidth); } plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); srcPtr = reinterpret_cast(decoder->bufferStart); @@ -115,7 +109,7 @@ namespace orc { plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); dstPtr++; backupByteLen = 0; - len--; + remainingNumElements--; } else { decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, resetBuf, backupByteLen); @@ -139,9 +133,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -167,7 +160,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -185,9 +178,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, 
bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -235,7 +227,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -253,9 +245,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -303,7 +294,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -321,9 +312,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -359,7 +349,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, 
backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -377,9 +367,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -427,7 +416,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -445,9 +434,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -495,7 +483,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -513,9 +501,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_8Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + 
len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -563,7 +550,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -581,9 +568,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -681,7 +667,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -699,9 +685,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -737,7 +722,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, 
numElements, resetBuf, srcPtr, dstPtr); } } @@ -755,9 +740,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -864,7 +848,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -882,9 +866,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -920,7 +903,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -938,9 +921,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, 
tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1047,7 +1029,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1065,9 +1047,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1115,7 +1096,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1133,9 +1114,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_16Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1242,7 +1222,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, 
numElements, resetBuf, srcPtr, dstPtr); } } @@ -1259,7 +1239,7 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += moveByteLen(len * bitWidth); if (bufMoveByteLen <= bufRestByteLen) { numElements = len; @@ -1296,7 +1276,7 @@ namespace orc { } if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); srcPtr = reinterpret_cast(decoder->bufferStart); dstPtr += numElements; @@ -1341,9 +1321,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1440,7 +1419,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1458,9 +1437,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ 
-1557,7 +1535,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1575,9 +1553,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1674,7 +1651,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1692,9 +1669,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -1729,7 +1705,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1747,9 +1723,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - 
alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1846,7 +1821,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1864,9 +1839,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -1963,7 +1937,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1982,9 +1956,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= 
VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -2081,7 +2054,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2098,7 +2071,7 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += moveByteLen(len * bitWidth); if (bufMoveByteLen <= bufRestByteLen) { numElements = len; @@ -2143,7 +2116,7 @@ namespace orc { } if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack24(dstPtr, 0, numElements); srcPtr = reinterpret_cast(decoder->bufferStart); dstPtr += numElements; @@ -2188,9 +2161,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2287,7 +2259,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2305,9 +2277,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, 
dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2342,7 +2313,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2360,9 +2331,8 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, tailBitLen, - backupByteLen, numElements, resetBuf, srcPtr, dstPtr, - UNPACK_32Bit_MAX_SIZE); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, + len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2467,7 +2437,7 @@ namespace orc { } } - alignTailerBoundary(startBit, bufMoveByteLen, bufRestByteLen, len, bitWidth, backupByteLen, + alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2484,7 +2454,7 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - bufMoveByteLen += moveLen(len * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen += moveByteLen(len * bitWidth); if (bufMoveByteLen <= bufRestByteLen) { numElements = len; @@ -2520,7 +2490,7 @@ namespace orc { } if (numElements > 0) { - bufMoveByteLen -= moveLen(numElements * bitWidth, ORC_VECTOR_BYTE_WIDTH); + bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); srcPtr = 
reinterpret_cast(decoder->bufferStart); dstPtr += numElements; diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 2ada55cd28..10f2727c99 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -71,16 +71,18 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); - inline void alignHeaderBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, - uint64_t& bufRestByteLen, uint64_t& len, uint32_t& bitWidth, + inline void alignHeaderBoundary(const uint32_t bitWidth, uint32_t bitMaxSize, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& remainingNumElements, uint64_t& tailBitLen, uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, - int64_t*& dstPtr, uint32_t bitMaxSize); + int64_t*& dstPtr); - inline void alignTailerBoundary(uint64_t& startBit, uint64_t& bufMoveByteLen, - uint64_t& bufRestByteLen, uint64_t& len, uint32_t& bitWidth, - uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, - const uint8_t*& srcPtr, int64_t*& dstPtr); + inline void alignTailerBoundary(const uint32_t bitWidth, uint64_t& startBit, + uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, + uint64_t& remainingNumElements, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr); private: RleDecoderV2* decoder; From fc2c288284316ca71e657bcc51e73937160ec679 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 12 Apr 2023 09:23:17 -0400 Subject: [PATCH 68/80] Change the parameter bitMaxSize type to const uint32_t --- c++/src/BpackingAvx512.cc | 2 +- c++/src/BpackingAvx512.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index f65a7abfde..f0c60ea6de 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -30,7 +30,7 @@ namespace orc { // PASS } - inline void 
UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, uint32_t bitMaxSize, + inline void UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, uint64_t& startBit, uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, uint64_t& remainingNumElements, diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 10f2727c99..48be9f1eac 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -71,7 +71,7 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); - inline void alignHeaderBoundary(const uint32_t bitWidth, uint32_t bitMaxSize, + inline void alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, uint64_t& startBit, uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, uint64_t& remainingNumElements, uint64_t& tailBitLen, uint32_t& backupByteLen, From 3468df0290d6bde2240241f5224fe728c4579338 Mon Sep 17 00:00:00 2001 From: wpleonardo Date: Thu, 13 Apr 2023 10:15:26 +0800 Subject: [PATCH 69/80] Change some parameter's type to const --- c++/src/BitUnpackerAvx512.hh | 270 +++++++++++++++++------------------ c++/src/BpackingAvx512.cc | 16 +-- c++/src/BpackingAvx512.hh | 2 +- c++/src/RLEv2.hh | 2 +- 4 files changed, 143 insertions(+), 147 deletions(-) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index ad9ef74261..c96e9e0218 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -50,414 +50,414 @@ namespace orc { (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */ // ------------------------------------ 3u ----------------------------------------- - static uint8_t shuffleIdxTable3u_0[64] = { + static const uint8_t shuffleIdxTable3u_0[64] = { 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 
2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u}; - static uint8_t shuffleIdxTable3u_1[64] = { + static const uint8_t shuffleIdxTable3u_1[64] = { 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; - static uint16_t shiftTable3u_0[32] = {13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, + static const uint16_t shiftTable3u_0[32] = {13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; - static uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + static const uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static uint16_t permutexIdxTable3u[32] = {0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, + static const uint16_t permutexIdxTable3u[32] = {0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, 0x0, 0x0, 0x0, 0x0, 0x0, 6u, 7u, 8u, 0x0, 0x0, 0x0, 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; // ------------------------------------ 5u ----------------------------------------- - static uint8_t shuffleIdxTable5u_0[64] = { + static const uint8_t shuffleIdxTable5u_0[64] = { 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; - static uint8_t shuffleIdxTable5u_1[64] = { + static const uint8_t shuffleIdxTable5u_1[64] = { 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 
5u, 7u, 6u, 8u, 7u, 10u, 9u}; - static uint16_t shiftTable5u_0[32] = {11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, + static const uint16_t shiftTable5u_0[32] = {11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; - static uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, + static const uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; - static uint16_t permutexIdxTable5u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + static const uint16_t permutexIdxTable5u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 6u ----------------------------------------- - static uint8_t shuffleIdxTable6u_0[64] = { + static const uint8_t shuffleIdxTable6u_0[64] = { 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; - static uint8_t shuffleIdxTable6u_1[64] = { + static const uint8_t shuffleIdxTable6u_1[64] = { 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; - static uint16_t shiftTable6u_0[32] = {10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, + static const uint16_t shiftTable6u_0[32] = {10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; - static uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 
0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + static const uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; - static uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + static const uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 7u ----------------------------------------- - static uint8_t shuffleIdxTable7u_0[64] = { + static const uint8_t shuffleIdxTable7u_0[64] = { 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u}; - static uint8_t shuffleIdxTable7u_1[64] = { + static const uint8_t shuffleIdxTable7u_1[64] = { 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; - static uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, + static const uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; - static uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + static const uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static uint16_t permutexIdxTable7u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + static const uint16_t permutexIdxTable7u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 
21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 9u ----------------------------------------- - static uint16_t permutexIdxTable9u_0[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, + static const uint16_t permutexIdxTable9u_0[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; - static uint16_t permutexIdxTable9u_1[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, + static const uint16_t permutexIdxTable9u_1[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u, 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; - static uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + static const uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; - static uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, + static const uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; - static uint8_t shuffleIdxTable9u_0[64] = { + static const uint8_t shuffleIdxTable9u_0[64] = { 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; - static uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, + static const uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; - static uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + static const uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; // ------------------------------------ 10u ----------------------------------------- - static uint8_t shuffleIdxTable10u_0[64] = { + static 
const uint8_t shuffleIdxTable10u_0[64] = { 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; - static uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + static const uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static uint16_t permutexIdxTable10u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + static const uint16_t permutexIdxTable10u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 11u ----------------------------------------- - static uint16_t permutexIdxTable11u_0[32] = { + static const uint16_t permutexIdxTable11u_0[32] = { 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u}; - static uint16_t permutexIdxTable11u_1[32] = { + static const uint16_t permutexIdxTable11u_1[32] = { 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; - static uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, + static const uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; - static uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, + static const uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; - static uint8_t shuffleIdxTable11u_0[64] = { + static const uint8_t shuffleIdxTable11u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 
0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; - static uint8_t shuffleIdxTable11u_1[64] = { + static const uint8_t shuffleIdxTable11u_1[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; - static uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, + static const uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; - static uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, + static const uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; - static uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + static const uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; // ------------------------------------ 12u ----------------------------------------- - static uint8_t shuffleIdxTable12u_0[64] = { + static const uint8_t shuffleIdxTable12u_0[64] = { 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; - static uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + static const uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; - static uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + static const uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 
8u, 0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 13u ----------------------------------------- - static uint16_t permutexIdxTable13u_0[32] = { + static const uint16_t permutexIdxTable13u_0[32] = { 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, 13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u}; - static uint16_t permutexIdxTable13u_1[32] = { + static const uint16_t permutexIdxTable13u_1[32] = { 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; - static uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, + static const uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; - static uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, + static const uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; - static uint8_t shuffleIdxTable13u_0[64] = { + static const uint8_t shuffleIdxTable13u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; - static uint8_t shuffleIdxTable13u_1[64] = { + static const uint8_t shuffleIdxTable13u_1[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; - static uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, + static const uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; - static uint32_t shiftTable13u_3[16] = {10u, 
12u, 6u, 8u, 10u, 12u, 6u, 8u, + static const uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; - static uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + static const uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; // ------------------------------------ 14u ----------------------------------------- - static uint8_t shuffleIdxTable14u_0[64] = { + static const uint8_t shuffleIdxTable14u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; - static uint8_t shuffleIdxTable14u_1[64] = { + static const uint8_t shuffleIdxTable14u_1[64] = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; - static uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, + static const uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; - static uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, + static const uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; - static uint16_t permutexIdxTable14u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + static const uint16_t permutexIdxTable14u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 15u ----------------------------------------- - static uint16_t permutexIdxTable15u_0[32] = { + static const 
uint16_t permutexIdxTable15u_0[32] = { 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u}; - static uint16_t permutexIdxTable15u_1[32] = { + static const uint16_t permutexIdxTable15u_1[32] = { 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; - static uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, + static const uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; - static uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, + static const uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; - static uint8_t shuffleIdxTable15u_0[64] = { + static const uint8_t shuffleIdxTable15u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u}; - static uint8_t shuffleIdxTable15u_1[64] = { + static const uint8_t shuffleIdxTable15u_1[64] = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; - static uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, + static const uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; - static uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + static const uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; - 
static uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + static const uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; // ------------------------------------ 17u ----------------------------------------- - static uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; - static uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; - static uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; - static uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + static const uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + static const uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; - static uint8_t shuffleIdxTable17u_0[64] = { + static const uint8_t shuffleIdxTable17u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; - static uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + static const uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; - static uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; + static const uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; // ------------------------------------ 18u ----------------------------------------- - static uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; - static uint32_t 
permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; - static uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; - static uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; + static const uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; + static const uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; - static uint8_t shuffleIdxTable18u_0[64] = { + static const uint8_t shuffleIdxTable18u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; - static uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + static const uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; - static uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + static const uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; // ------------------------------------ 19u ----------------------------------------- - static uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; - static uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + static const uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; - static uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; - static uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; + static const uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; + static const uint64_t shiftTable19u_1[8] = {13u, 
7u, 1u, 27u, 21u, 15u, 9u, 3u}; - static uint8_t shuffleIdxTable19u_0[64] = { + static const uint8_t shuffleIdxTable19u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; - static uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, + static const uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; - static uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; + static const uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; // ------------------------------------ 20u ----------------------------------------- - static uint8_t shuffleIdxTable20u_0[64] = { + static const uint8_t shuffleIdxTable20u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; - static uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, + static const uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; - static uint16_t permutexIdxTable20u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, + static const uint16_t permutexIdxTable20u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 21u ----------------------------------------- - static uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + static const uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; - static uint32_t 
permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + static const uint32_t permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; - static uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; - static uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; + static const uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; + static const uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; - static uint8_t shuffleIdxTable21u_0[64] = { + static const uint8_t shuffleIdxTable21u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; - static uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, + static const uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; - static uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; + static const uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; // ------------------------------------ 22u ----------------------------------------- - static uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + static const uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; - static uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, + static const uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; - static uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; - static uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; + static const uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; + static const uint64_t shiftTable22u_1[8] = {10u, 
30u, 18u, 6u, 26u, 14u, 2u, 22u}; - static uint8_t shuffleIdxTable22u_0[64] = { + static const uint8_t shuffleIdxTable22u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; - static uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, + static const uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; - static uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + static const uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; // ------------------------------------ 23u ----------------------------------------- - static uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + static const uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; - static uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, + static const uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; - static uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; - static uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; + static const uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; + static const uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; - static uint8_t shuffleIdxTable23u_0[64] = { + static const uint8_t shuffleIdxTable23u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; - static uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 
8u, + static const uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; - static uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; + static const uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; // ------------------------------------ 24u ----------------------------------------- - static uint8_t shuffleIdxTable24u_0[64] = { + static const uint8_t shuffleIdxTable24u_0[64] = { 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; - static uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + static const uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 26u ----------------------------------------- - static uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + static const uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; - static uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, + static const uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; - static uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; - static uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; + static const uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; + static const uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; - static uint8_t shuffleIdxTable26u_0[64] = { + static const uint8_t shuffleIdxTable26u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 
11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; - static uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, + static const uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + static const uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; // ------------------------------------ 28u ----------------------------------------- - static uint8_t shuffleIdxTable28u_0[64] = { + static const uint8_t shuffleIdxTable28u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; - static uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + static const uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; - static uint16_t permutexIdxTable28u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, + static const uint16_t permutexIdxTable28u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 30u ----------------------------------------- - static uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, + static const uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; - static uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, + static const uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; - static uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 
4u}; - static uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; + static const uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; + static const uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; - static uint8_t shuffleIdxTable30u_0[64] = { + static const uint8_t shuffleIdxTable30u_0[64] = { 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u}; - static uint8_t shuffleIdxTable30u_1[64] = { + static const uint8_t shuffleIdxTable30u_1[64] = { 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u}; - static uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; - static uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; - static uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + static const uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; + static const uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; + static const uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; - static uint64_t nibbleReverseTable[8] = { + static const uint64_t nibbleReverseTable[8] = { 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901}; - static uint64_t reverseMaskTable1u[8] = { + static const uint64_t reverseMaskTable1u[8] = { 0x0001020304050607, 0x08090A0B0C0D0E0F, 0x1011121314151617, 0x18191A1B1C1D1E1F, 0x2021222324252627, 0x28292A2B2C2D2E2F, 0x3031323334353637, 
0x38393A3B3C3D3E3F}; - static uint64_t reverseMaskTable16u[8] = { + static const uint64_t reverseMaskTable16u[8] = { 0x0607040502030001, 0x0E0F0C0D0A0B0809, 0x1617141512131011, 0x1E1F1C1D1A1B1819, 0x2627242522232021, 0x2E2F2C2D2A2B2829, 0x3637343532333031, 0x3E3F3C3D3A3B3839}; - static uint64_t reverseMaskTable32u[8] = { + static const uint64_t reverseMaskTable32u[8] = { 0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B, 0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B}; diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index f0c60ea6de..4ec8ff869f 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -37,12 +37,11 @@ namespace orc { uint64_t& tailBitLen, uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, int64_t*& dstPtr) { + uint64_t numBits = remainingNumElements * bitWidth; if (startBit != 0) { - bufMoveByteLen += - moveByteLen(remainingNumElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen += moveByteLen(remainingNumElements * bitWidth); + numBits += startBit - ORC_VECTOR_BYTE_WIDTH; } + bufMoveByteLen += moveByteLen(numBits); if (bufMoveByteLen <= bufRestByteLen) { numElements = remainingNumElements; @@ -103,16 +102,13 @@ namespace orc { return; } + decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, + resetBuf, backupByteLen); if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); dstPtr++; backupByteLen = 0; remainingNumElements--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); } bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; @@ -1292,7 +1288,7 @@ namespace orc { if (backupByteLen != 0) { 
decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, resetBuf, backupByteLen); - ; + unpackDefault.unrolledUnpack16(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 48be9f1eac..aad178b13c 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -88,7 +88,7 @@ namespace orc { RleDecoderV2* decoder; UnpackDefault unpackDefault; - // Used by vectorially bit-unpacking data + // Used by vectorized bit-unpacking data uint32_t vectorBuf[VECTOR_UNPACK_32BIT_MAX_NUM + 1]; }; diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index 82a0954c13..4786f7074d 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -238,7 +238,7 @@ namespace orc { inputStream->BackUp(backupByteLen); } - if (len >= remainingLen && resetBuf == true) { + if (len >= remainingLen && resetBuf) { if (!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::resetBufferStart"); } From 93feaf945ae847c33c2ee5db22af94c0e567b0ae Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 17 Apr 2023 11:48:20 -0400 Subject: [PATCH 70/80] 1. Changed the parameters bufferStart, bufferEnd, bitsLeft and curByte back to private 2. 
Added functions to get and set these private parameters --- c++/src/BitUnpackerAvx512.hh | 186 ++++++++-------- c++/src/BpackingAvx512.cc | 399 +++++++++++++++-------------------- c++/src/BpackingDefault.cc | 153 +++++++------- c++/src/RLEv2.hh | 62 ++++-- c++/src/RleDecoderV2.cc | 34 +-- 5 files changed, 405 insertions(+), 429 deletions(-) diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh index c96e9e0218..5b04866718 100644 --- a/c++/src/BitUnpackerAvx512.hh +++ b/c++/src/BitUnpackerAvx512.hh @@ -59,14 +59,14 @@ namespace orc { 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; static const uint16_t shiftTable3u_0[32] = {13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, - 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, - 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; + 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, + 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; static const uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, - 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, - 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static const uint16_t permutexIdxTable3u[32] = {0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, - 0x0, 0x0, 0x0, 0x0, 0x0, 6u, 7u, 8u, 0x0, 0x0, 0x0, - 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable3u[32] = { + 0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, 0x0, 0x0, 0x0, 0x0, 0x0, + 6u, 7u, 8u, 0x0, 0x0, 0x0, 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; // ------------------------------------ 5u ----------------------------------------- static const uint8_t shuffleIdxTable5u_0[64] = { @@ -78,14 +78,14 @@ namespace orc { 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 
5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u}; static const uint16_t shiftTable5u_0[32] = {11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, - 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, - 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; + 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, + 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; static const uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, - 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, - 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; - static const uint16_t permutexIdxTable5u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, - 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, - 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, + 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; + static const uint16_t permutexIdxTable5u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 6u ----------------------------------------- static const uint8_t shuffleIdxTable6u_0[64] = { @@ -99,13 +99,13 @@ namespace orc { 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; static const uint16_t shiftTable6u_0[32] = {10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, - 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, - 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; + 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, + 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; static const uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, - 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; static const uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, - 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + 6u, 7u, 8u, 
0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 7u ----------------------------------------- static const uint8_t shuffleIdxTable7u_0[64] = { @@ -119,34 +119,34 @@ namespace orc { 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; static const uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, - 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, - 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; + 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, + 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; static const uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, - 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, - 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static const uint16_t permutexIdxTable7u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, - 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, - 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable7u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 9u ----------------------------------------- - static const uint16_t permutexIdxTable9u_0[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, - 6u, 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u, 11u, 12u, - 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; - static const uint16_t permutexIdxTable9u_1[32] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, - 7u, 7u, 8u, 8u, 9u, 9u, 10u, 10u, 11u, 11u, 12u, - 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; + static const uint16_t permutexIdxTable9u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, + 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; + static const uint16_t 
permutexIdxTable9u_1[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u, + 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; static const uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, - 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; static const uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, - 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; + 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; static const uint8_t shuffleIdxTable9u_0[64] = { 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; static const uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, - 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, - 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, + 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; static const uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; // ------------------------------------ 10u ----------------------------------------- @@ -155,11 +155,11 @@ namespace orc { 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; static const uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, - 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, - 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; - static const uint16_t permutexIdxTable10u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, - 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, - 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable10u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 
6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 11u ----------------------------------------- static const uint16_t permutexIdxTable11u_0[32] = { @@ -169,9 +169,9 @@ namespace orc { 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; static const uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, - 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; + 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; static const uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, - 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; + 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; static const uint8_t shuffleIdxTable11u_0[64] = { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, @@ -184,9 +184,9 @@ namespace orc { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; static const uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, - 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; + 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; static const uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, - 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; + 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; static const uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; // ------------------------------------ 12u ----------------------------------------- @@ -196,10 +196,10 @@ namespace orc { 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; static const uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, - 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; static const uint32_t 
permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, - 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 13u ----------------------------------------- static const uint16_t permutexIdxTable13u_0[32] = { @@ -209,9 +209,9 @@ namespace orc { 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; static const uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, - 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; + 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; static const uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u, - 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; + 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; static const uint8_t shuffleIdxTable13u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, @@ -224,9 +224,9 @@ namespace orc { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; static const uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, - 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; + 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; static const uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, - 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; + 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; static const uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; // ------------------------------------ 14u ----------------------------------------- @@ -241,12 +241,12 @@ namespace orc { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; static const uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, - 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; + 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; static const uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, - 12u, 
8u, 12u, 8u, 12u, 8u, 12u, 8u}; - static const uint16_t permutexIdxTable14u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, - 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, - 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static const uint16_t permutexIdxTable14u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 15u ----------------------------------------- static const uint16_t permutexIdxTable15u_0[32] = { @@ -256,9 +256,9 @@ namespace orc { 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; static const uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, - 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; + 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; static const uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, - 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; static const uint8_t shuffleIdxTable15u_0[64] = { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, @@ -271,16 +271,16 @@ namespace orc { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; static const uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, - 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; + 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; static const uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, - 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; static const uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; // ------------------------------------ 17u ----------------------------------------- static const uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 
3u, 4u, - 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; static const uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, - 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; static const uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; static const uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; @@ -289,14 +289,14 @@ namespace orc { 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; static const uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; static const uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; // ------------------------------------ 18u ----------------------------------------- static const uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, - 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; static const uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, - 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; static const uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; static const uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; @@ -305,14 +305,14 @@ namespace orc { 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; static const uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, - 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; static const uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; // ------------------------------------ 19u ----------------------------------------- static const uint32_t permutexIdxTable19u_0[16] = 
{0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, - 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; + 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; static const uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, - 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; static const uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; static const uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; @@ -321,7 +321,7 @@ namespace orc { 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; static const uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, - 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; + 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; static const uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; // ------------------------------------ 20u ----------------------------------------- @@ -330,16 +330,16 @@ namespace orc { 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; static const uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, - 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; - static const uint16_t permutexIdxTable20u[32] = {0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, - 8u, 9u, 0x0, 0x0, 0x0, 10u, 11u, 12u, 13u, 14u, 0x0, - 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static const uint16_t permutexIdxTable20u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; // ------------------------------------ 21u ----------------------------------------- static const uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, - 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; + 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; static const uint32_t 
permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, - 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; + 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; static const uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; static const uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; @@ -348,14 +348,14 @@ namespace orc { 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; static const uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, - 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; + 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; static const uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; // ------------------------------------ 22u ----------------------------------------- static const uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, - 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; + 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; static const uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, - 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; + 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; static const uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; static const uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; @@ -365,14 +365,14 @@ namespace orc { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; static const uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, - 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; + 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; static const uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; // ------------------------------------ 23u ----------------------------------------- static const uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, - 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; + 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; static const uint32_t 
permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, - 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; + 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; static const uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; static const uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; @@ -382,7 +382,7 @@ namespace orc { 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; static const uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, - 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; + 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; static const uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; // ------------------------------------ 24u ----------------------------------------- @@ -392,13 +392,13 @@ namespace orc { 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; static const uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, - 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; // ------------------------------------ 26u ----------------------------------------- static const uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, - 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; + 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; static const uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, - 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; + 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; static const uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; static const uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; @@ -408,7 +408,7 @@ namespace orc { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; static const uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, - 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + 6u, 
4u, 2u, 0u, 6u, 4u, 2u, 0u}; static const uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; // ------------------------------------ 28u ----------------------------------------- @@ -418,16 +418,16 @@ namespace orc { 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; static const uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, - 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; - static const uint16_t permutexIdxTable28u[32] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, - 10u, 11u, 12u, 13u, 0x0, 14u, 15u, 16u, 17u, 18u, 19u, - 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static const uint16_t permutexIdxTable28u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; // ------------------------------------ 30u ----------------------------------------- static const uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, - 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; + 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; static const uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, - 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; + 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; static const uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; static const uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 4ec8ff869f..506bd814f1 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -70,8 +70,8 @@ namespace orc { if (align != 0) { bufMoveByteLen -= moveByteLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + srcPtr = 
reinterpret_cast(decoder->getBufStart()); + bufRestByteLen = decoder->bufLength(); dstPtr += align; numElements -= align; } @@ -85,25 +85,23 @@ namespace orc { bool& resetBuf, const uint8_t*& srcPtr, int64_t*& dstPtr) { if (numElements > 0) { + uint64_t numBits = numElements * bitWidth; if (startBit != 0) { - bufMoveByteLen -= moveByteLen(numElements * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); - } else { - bufMoveByteLen -= moveByteLen(numElements * bitWidth); + numBits += startBit - ORC_VECTOR_BYTE_WIDTH; } + bufMoveByteLen -= moveByteLen(numBits); plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); } if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); + decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; } - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); + decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); if (backupByteLen != 0) { plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); dstPtr++; @@ -111,18 +109,18 @@ namespace orc { remainingNumElements--; } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); } void UnpackAvx512::vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 1; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t 
bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -146,9 +144,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, srcmm); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -163,11 +160,11 @@ namespace orc { void UnpackAvx512::vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 2; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -213,9 +210,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, srcmm0); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -230,11 +226,11 @@ namespace orc { void UnpackAvx512::vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 3; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); 
uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -280,9 +276,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -297,11 +292,11 @@ namespace orc { void UnpackAvx512::vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 4; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -335,9 +330,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, srcmm0); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -352,11 +346,11 @@ namespace orc { void UnpackAvx512::vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 5; - const uint8_t* srcPtr = 
reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -402,9 +396,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -419,11 +412,11 @@ namespace orc { void UnpackAvx512::vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 6; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -469,9 +462,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -486,11 +478,11 @@ namespace orc { void 
UnpackAvx512::vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 7; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -536,9 +528,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 8 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 8 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 8 * bitWidth; numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); @@ -553,11 +544,11 @@ namespace orc { void UnpackAvx512::vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 9; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -602,9 +593,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; 
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -653,9 +643,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -670,11 +659,11 @@ namespace orc { void UnpackAvx512::vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 10; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -708,9 +697,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -725,11 +713,11 @@ namespace orc { void UnpackAvx512::vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 11; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t 
bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -783,9 +771,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -834,9 +821,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -851,11 +837,11 @@ namespace orc { void UnpackAvx512::vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 12; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -889,9 +875,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + 
decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -906,11 +891,11 @@ namespace orc { void UnpackAvx512::vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 13; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -964,9 +949,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1015,9 +999,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1032,11 +1015,11 @@ namespace orc { void UnpackAvx512::vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 14; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + 
const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1082,9 +1065,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1099,11 +1081,11 @@ namespace orc { void UnpackAvx512::vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 15; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1157,9 +1139,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1208,9 +1189,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, zmm[0]); srcPtr += 4 * 
bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1225,10 +1205,10 @@ namespace orc { void UnpackAvx512::vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 16; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = len; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); int64_t* dstPtr = data + offset; bool resetBuf = false; uint64_t tailBitLen = 0; @@ -1261,9 +1241,8 @@ namespace orc { _mm512_storeu_si512(simdPtr, srcmm); srcPtr += 4 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 4 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 4 * bitWidth; numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); @@ -1274,43 +1253,37 @@ namespace orc { if (numElements > 0) { bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); } if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, 
backupByteLen); + decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; } + decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - unpackDefault.unrolledUnpack16(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); } } void UnpackAvx512::vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 17; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1354,9 +1327,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1405,9 +1377,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = 
decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1422,11 +1393,11 @@ namespace orc { void UnpackAvx512::vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 18; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1470,9 +1441,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1521,9 +1491,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1538,11 +1507,11 @@ namespace orc { void UnpackAvx512::vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 19; - 
const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1586,9 +1555,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1637,9 +1605,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1654,11 +1621,11 @@ namespace orc { void UnpackAvx512::vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 20; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1691,9 +1658,8 
@@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1708,11 +1674,11 @@ namespace orc { void UnpackAvx512::vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 21; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1756,9 +1722,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1807,9 +1772,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, 
vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1824,11 +1788,11 @@ namespace orc { void UnpackAvx512::vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 22; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -1872,9 +1836,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1923,9 +1886,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -1940,11 +1902,11 @@ namespace orc { void UnpackAvx512::vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 23; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t 
bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; @@ -1989,9 +1951,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2040,9 +2001,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2057,11 +2017,11 @@ namespace orc { void UnpackAvx512::vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 24; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t tailBitLen = 0; uint32_t backupByteLen = 0; @@ -2101,9 +2061,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + 
decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2114,43 +2073,37 @@ namespace orc { if (numElements > 0) { bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack24(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); } if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); + decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; } + decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - ; unpackDefault.unrolledUnpack24(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); } } void UnpackAvx512::vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 26; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool 
resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -2194,9 +2147,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2245,9 +2197,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2262,11 +2213,11 @@ namespace orc { void UnpackAvx512::vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 28; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -2299,9 +2250,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); 
bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2316,11 +2266,11 @@ namespace orc { void UnpackAvx512::vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 30; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t startBit = 0; uint64_t tailBitLen = 0; @@ -2373,9 +2323,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2423,9 +2372,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, zmm[0]); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2440,11 +2388,11 @@ namespace orc { void UnpackAvx512::vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len) { uint32_t bitWidth = 32; - const uint8_t* srcPtr = reinterpret_cast(decoder->bufferStart); + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); 
uint64_t numElements = 0; int64_t* dstPtr = data + offset; uint64_t bufMoveByteLen = 0; - uint64_t bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + uint64_t bufRestByteLen = decoder->bufLength(); bool resetBuf = false; uint64_t tailBitLen = 0; uint32_t backupByteLen = 0; @@ -2475,9 +2423,8 @@ namespace orc { _mm512_storeu_si512(vectorBuf, srcmm); srcPtr += 2 * bitWidth; - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, 2 * bitWidth, false, - 0); - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); bufMoveByteLen -= 2 * bitWidth; numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); @@ -2488,33 +2435,27 @@ namespace orc { if (numElements > 0) { bufMoveByteLen -= moveByteLen(numElements * bitWidth); unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); dstPtr += numElements; - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); } if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufMoveByteLen, - resetBuf, backupByteLen); + decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); return; } + decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); if (backupByteLen != 0) { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); - ; unpackDefault.unrolledUnpack32(dstPtr, 0, 1); dstPtr++; backupByteLen = 0; len--; - } else { - decoder->resetBufferStart(&decoder->bufferStart, &decoder->bufferEnd, bufRestByteLen, - resetBuf, backupByteLen); } - bufRestByteLen = decoder->bufferEnd - decoder->bufferStart; + bufRestByteLen = decoder->bufLength(); bufMoveByteLen = 0; - srcPtr = 
reinterpret_cast(decoder->bufferStart); + srcPtr = reinterpret_cast(decoder->getBufStart()); } } @@ -2523,22 +2464,22 @@ namespace orc { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > decoder->bitsLeft) { - result <<= decoder->bitsLeft; - result |= decoder->curByte & ((1 << decoder->bitsLeft) - 1); - bitsLeftToRead -= decoder->bitsLeft; - decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - decoder->bitsLeft = 8; + while (bitsLeftToRead > decoder->getBitsLeft()) { + result <<= decoder->getBitsLeft(); + result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); + bitsLeftToRead -= decoder->getBitsLeft(); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - decoder->bitsLeft -= static_cast(bitsLeftToRead); - result |= (decoder->curByte >> decoder->bitsLeft) & ((1 << bitsLeftToRead) - 1); + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); - startBit = decoder->bitsLeft == 0 ? 0 : (8 - decoder->bitsLeft); + startBit = decoder->getBitsLeft() == 0 ? 0 : (8 - decoder->getBitsLeft()); } } diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 1248a06afb..163a3260a1 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -34,18 +34,18 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
- while (decoder->bitsLeft > 0 && curIdx < offset + len) { - decoder->bitsLeft -= 4; - data[curIdx++] = (decoder->curByte >> decoder->bitsLeft) & 15; + while (decoder->getBitsLeft() > 0 && curIdx < offset + len) { + decoder->setBitsLeft(decoder->getBitsLeft() - 4); + data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15; } if (curIdx == offset + len) return; // Exhaust the buffer uint64_t numGroups = (offset + len - curIdx) / 2; numGroups = - std::min(numGroups, static_cast(decoder->bufferEnd - decoder->bufferStart)); + std::min(numGroups, static_cast(decoder->bufLength())); // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); uint32_t localByte; for (uint64_t i = 0; i < numGroups; ++i) { localByte = *buffer++; @@ -53,12 +53,12 @@ namespace orc { data[curIdx + 1] = localByte & 15; curIdx += 2; } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd' - decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - decoder->bitsLeft = 8; + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); } } @@ -66,18 +66,18 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = decoder->bufferEnd - decoder->bufferStart; + int64_t bufferNum = decoder->bufLength(); bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); // Avoid updating 'bufferStart' inside the loop. 
- auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { data[curIdx++] = *buffer++; } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // readByte() will update 'bufferStart' and 'bufferEnd'. - data[curIdx++] = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + data[curIdx++] = decoder->readByte(); } } @@ -85,23 +85,23 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 2; + int64_t bufferNum = decoder->bufLength() / 2; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint16_t b0, b1; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); buffer += 2; data[curIdx++] = (b0 << 8) | b1; } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); data[curIdx++] = (b0 << 8) | b1; } } @@ -110,11 +110,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 3; + int64_t bufferNum = decoder->bufLength() / 3; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2; // Avoid updating 'bufferStart' inside the loop. 
- auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -122,13 +122,14 @@ namespace orc { buffer += 3; data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); } - decoder->bufferStart += bufferNum * 3; + //////decoder->bufferStart += bufferNum * 3; + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); } } @@ -137,11 +138,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 4; + int64_t bufferNum = decoder->bufLength() / 4; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint32_t b0, b1, b2, b3; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -150,14 +151,14 @@ namespace orc { buffer += 4; data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); } } @@ -166,11 +167,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 5; + int64_t bufferNum = decoder->bufLength() / 5; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -181,15 +182,15 @@ namespace orc { data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); } } @@ -198,11 +199,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 6; + int64_t bufferNum = decoder->bufLength() / 6; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -214,16 +215,16 @@ namespace orc { data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); } @@ -233,11 +234,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 7; + int64_t bufferNum = decoder->bufLength() / 7; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -250,17 +251,17 @@ namespace orc { data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b6 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | (b4 << 16) | (b5 << 8) | b6); } @@ -270,11 +271,11 @@ namespace orc { uint64_t curIdx = offset; while (curIdx < offset + len) { // Exhaust the buffer - int64_t bufferNum = (decoder->bufferEnd - decoder->bufferStart) / 8; + int64_t bufferNum = decoder->bufLength() / 8; bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); uint64_t b0, b1, b2, b3, b4, b5, b6, b7; // Avoid updating 'bufferStart' inside the loop. - auto* buffer = reinterpret_cast(decoder->bufferStart); + auto* buffer = reinterpret_cast(decoder->getBufStart()); for (int i = 0; i < bufferNum; ++i) { b0 = static_cast(*buffer); b1 = static_cast(*(buffer + 1)); @@ -288,18 +289,18 @@ namespace orc { data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); } - decoder->bufferStart = reinterpret_cast(buffer); + decoder->setBufStart(reinterpret_cast(buffer)); if (curIdx == offset + len) return; // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b1 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b2 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b3 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b4 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b5 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b6 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - b7 = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); + b7 = decoder->readByte(); data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); } @@ -309,19 +310,19 @@ namespace orc { for (uint64_t i = offset; i < (offset + len); i++) { uint64_t result = 0; uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > decoder->bitsLeft) { - result <<= decoder->bitsLeft; - result |= decoder->curByte & ((1 << decoder->bitsLeft) - 1); - bitsLeftToRead -= decoder->bitsLeft; - decoder->curByte = decoder->readByte(&decoder->bufferStart, &decoder->bufferEnd); - decoder->bitsLeft = 8; + while (bitsLeftToRead > decoder->getBitsLeft()) { + result <<= decoder->getBitsLeft(); + result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); + bitsLeftToRead -= decoder->getBitsLeft(); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - decoder->bitsLeft -= static_cast(bitsLeftToRead); - result |= (decoder->curByte >> decoder->bitsLeft) & ((1 << bitsLeftToRead) - 1); + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << 
bitsLeftToRead) - 1); } data[i] = static_cast(result); } diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index 4786f7074d..218aa5fa59 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -166,19 +166,49 @@ namespace orc { void next(int16_t* data, uint64_t numValues, const char* notNull) override; - unsigned char readByte(char** bufStart, char** bufEnd); + unsigned char readByte(); + + void setBufStart(char* start) { + bufferStart = start; + } + + char* getBufStart() { + return bufferStart; + } + + void setBufEnd(char* end) { + bufferEnd = end; + } + + char* getBufEnd() { + return bufferEnd; + } + + uint64_t bufLength() { + return bufferEnd - bufferStart; + } + + void setBitsLeft(uint32_t bits) { + bitsLeft = bits; + } + + void setCurByte(uint32_t byte) { + curByte = byte; + } + + uint32_t getBitsLeft() { + return bitsLeft; + } + + uint32_t getCurByte() { + return curByte; + } /** * Most hotspot of this function locates in saving stack, so inline this function to have * performance gain. 
*/ - inline void resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, bool resetBuf, - uint32_t backupLen); - - char* bufferStart; - char* bufferEnd; - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs + inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); private: /** @@ -222,15 +252,19 @@ namespace orc { const std::unique_ptr inputStream; const bool isSigned; unsigned char firstByte; + char* bufferStart; + char* bufferEnd; uint64_t runLength; // Length of the current run uint64_t runRead; // Number of returned values of the current run + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run }; - inline void RleDecoderV2::resetBufferStart(char** bufStart, char** bufEnd, uint64_t len, - bool resetBuf, uint32_t backupByteLen) { - uint64_t remainingLen = *bufEnd - *bufStart; + inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { + char* bufStart = getBufStart(); + uint64_t remainingLen = bufLength(); int bufferLength = 0; const void* bufferPointer = nullptr; @@ -245,10 +279,10 @@ namespace orc { } if (bufferPointer == nullptr) { - *bufStart += len; + setBufStart(bufStart + len); } else { - *bufStart = const_cast(static_cast(bufferPointer)); - *bufEnd = *bufStart + bufferLength; + setBufStart(const_cast(static_cast(bufferPointer))); + setBufEnd(const_cast(static_cast(bufferPointer)) + bufferLength); } } } // namespace orc diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index f1eff52076..59531cb76a 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -29,19 +29,19 @@ namespace orc { - unsigned char RleDecoderV2::readByte(char** bufStart, char** bufEnd) { + unsigned char RleDecoderV2::readByte() { SCOPED_MINUS_STOPWATCH(metrics, 
DecodingLatencyUs); - if (*bufStart == *bufEnd) { + if (bufferStart == bufferEnd) { int bufferLength; const void* bufferPointer; if (!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::readByte"); } - *bufStart = const_cast(static_cast(bufferPointer)); - *bufEnd = *bufStart + bufferLength; + bufferStart = const_cast(static_cast(bufferPointer)); + bufferEnd = bufferStart + bufferLength; } - unsigned char result = static_cast(*(*bufStart)++); + unsigned char result = static_cast(*bufferStart++); return result; } @@ -50,7 +50,7 @@ namespace orc { uint64_t n = bsz; while (n > 0) { n--; - val = readByte(&bufferStart, &bufferEnd); + val = readByte(); ret |= (val << (n * 8)); } return ret; @@ -64,7 +64,7 @@ namespace orc { uint64_t ret = 0, b; uint64_t offset = 0; do { - b = readByte(&bufferStart, &bufferEnd); + b = readByte(); ret |= (0x7f & b) << offset; offset += 7; } while (b >= 0x80); @@ -92,15 +92,15 @@ namespace orc { RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, MemoryPool& pool, ReaderMetrics* _metrics) : RleDecoder(_metrics), - bufferStart(nullptr), - bufferEnd(bufferStart), - bitsLeft(0), - curByte(0), inputStream(std::move(input)), isSigned(_isSigned), firstByte(0), + bufferStart(nullptr), + bufferEnd(bufferStart), runLength(0), runRead(0), + bitsLeft(0), + curByte(0), unpackedPatch(pool, 0), literals(pool, MAX_LITERAL_SIZE) { // PASS @@ -144,7 +144,7 @@ namespace orc { if (runRead == runLength) { resetRun(); - firstByte = readByte(&bufferStart, &bufferEnd); + firstByte = readByte(); } uint64_t offset = nRead, length = numValues - nRead; @@ -231,7 +231,7 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(&bufferStart, &bufferEnd); + runLength |= readByte(); // runs are one off runLength += 1; runRead = 0; @@ -280,13 +280,13 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= 
readByte(&bufferStart, &bufferEnd); + runLength |= readByte(); // runs are one off runLength += 1; runRead = 0; // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(&bufferStart, &bufferEnd); + uint64_t thirdByte = readByte(); uint64_t byteSize = (thirdByte >> 5) & 0x07; // base width is one off byteSize += 1; @@ -296,7 +296,7 @@ namespace orc { uint32_t patchBitSize = decodeBitWidth(pwo); // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(&bufferStart, &bufferEnd); + uint64_t fourthByte = readByte(); uint32_t pgw = (fourthByte >> 5) & 0x07; // patch gap width is one off pgw += 1; @@ -384,7 +384,7 @@ namespace orc { // extract the run length runLength = static_cast(firstByte & 0x01) << 8; - runLength |= readByte(&bufferStart, &bufferEnd); + runLength |= readByte(); ++runLength; // account for first value runRead = 0; From b48ec06c830e45f4879e80ab8ba982545ebf7b45 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 18 Apr 2023 10:37:52 -0400 Subject: [PATCH 71/80] 1. Modified vectorUnpack16,vectorUnpack24,vectorUnpack32 to support alignHeaderBoundary and alignTailerBoundary 2. 
Solve the conflicts in c++/src/CMakeLists.txt --- c++/src/BpackingAvx512.cc | 411 +++++++++++++++++--------------------- c++/src/BpackingAvx512.hh | 13 +- c++/src/CMakeLists.txt | 1 + 3 files changed, 193 insertions(+), 232 deletions(-) diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc index 506bd814f1..22f6972fb6 100644 --- a/c++/src/BpackingAvx512.cc +++ b/c++/src/BpackingAvx512.cc @@ -30,6 +30,7 @@ namespace orc { // PASS } + template inline void UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, uint64_t& startBit, uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, @@ -38,7 +39,7 @@ namespace orc { uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, int64_t*& dstPtr) { uint64_t numBits = remainingNumElements * bitWidth; - if (startBit != 0) { + if (hasBitOffset && startBit != 0) { numBits += startBit - ORC_VECTOR_BYTE_WIDTH; } bufMoveByteLen += moveByteLen(numBits); @@ -49,7 +50,7 @@ namespace orc { remainingNumElements = 0; } else { uint64_t leadingBits = 0; - if (startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit; + if (hasBitOffset && startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit; uint64_t bufRestBitLen = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + leadingBits; numElements = bufRestBitLen / bitWidth; remainingNumElements -= numElements; @@ -62,7 +63,7 @@ namespace orc { tailBitLen = 0; } - if (startBit > 0) { + if (hasBitOffset && startBit > 0) { uint32_t align = getAlign(startBit, bitWidth, bitMaxSize); if (align > numElements) { align = numElements; @@ -78,19 +79,37 @@ namespace orc { } } - inline void UnpackAvx512::alignTailerBoundary(const uint32_t bitWidth, uint64_t& startBit, - uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, + template + inline void UnpackAvx512::alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& remainingNumElements, uint32_t& 
backupByteLen, uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, int64_t*& dstPtr) { if (numElements > 0) { uint64_t numBits = numElements * bitWidth; - if (startBit != 0) { + if (hasBitOffset && startBit != 0) { numBits += startBit - ORC_VECTOR_BYTE_WIDTH; } bufMoveByteLen -= moveByteLen(numBits); - plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + if (hasBitOffset) { + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + } else { + switch (specialBit) { + case 16: + unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); + break; + case 24: + unpackDefault.unrolledUnpack24(dstPtr, 0, numElements); + break; + case 32: + unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); + break; + default: + break; + } + } srcPtr = reinterpret_cast(decoder->getBufStart()); dstPtr += numElements; bufRestByteLen = decoder->bufLength(); @@ -103,7 +122,23 @@ namespace orc { decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); if (backupByteLen != 0) { - plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + if (hasBitOffset) { + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + } else { + switch (specialBit) { + case 16: + unpackDefault.unrolledUnpack16(dstPtr, 0, 1); + break; + case 24: + unpackDefault.unrolledUnpack24(dstPtr, 0, 1); + break; + case 32: + unpackDefault.unrolledUnpack32(dstPtr, 0, 1); + break; + default: + break; + } + } dstPtr++; backupByteLen = 0; remainingNumElements--; @@ -127,8 +162,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -153,8 +189,8 @@ 
namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -171,8 +207,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -219,8 +256,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -237,8 +274,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -285,8 +323,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -303,8 +341,9 @@ 
namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -339,8 +378,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -357,8 +396,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -405,8 +445,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -423,8 +463,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + 
bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -471,8 +512,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -489,8 +530,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { uint8_t* simdPtr = reinterpret_cast(vectorBuf); @@ -537,8 +579,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -555,8 +597,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -652,8 +695,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, 
backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -670,8 +713,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -706,8 +750,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -724,8 +768,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -830,8 +875,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -848,8 +893,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - 
alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -884,8 +930,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -902,8 +948,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1008,8 +1055,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1026,8 +1073,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, 
numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1074,8 +1122,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1092,8 +1140,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1198,8 +1247,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1213,24 +1262,12 @@ namespace orc { bool resetBuf = false; uint64_t tailBitLen = 0; uint32_t backupByteLen = 0; + uint64_t startBit = 0; while (len > 0) { - bufMoveByteLen += moveByteLen(len * bitWidth); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, 
len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { uint16_t* simdPtr = reinterpret_cast(vectorBuf); @@ -1250,30 +1287,8 @@ namespace orc { } } - if (numElements > 0) { - bufMoveByteLen -= moveByteLen(numElements * bitWidth); - unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->getBufStart()); - dstPtr += numElements; - bufRestByteLen = decoder->bufLength(); - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - if (backupByteLen != 0) { - unpackDefault.unrolledUnpack16(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } - - bufRestByteLen = decoder->bufLength(); - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->getBufStart()); + alignTailerBoundary(bitWidth, 16, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1290,8 +1305,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1386,8 +1402,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1404,8 +1420,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - 
alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -1500,8 +1517,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1518,8 +1535,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1614,8 +1632,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1632,8 +1650,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, 
len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -1667,8 +1686,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1685,8 +1704,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -1781,8 +1801,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1799,8 +1819,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -1895,8 +1916,8 @@ namespace orc { } } - 
alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -1914,8 +1935,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); @@ -2010,8 +2032,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2025,24 +2047,12 @@ namespace orc { bool resetBuf = false; uint64_t tailBitLen = 0; uint32_t backupByteLen = 0; + uint64_t startBit = 0; while (len > 0) { - bufMoveByteLen += moveByteLen(len * bitWidth); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2070,30 +2080,8 @@ namespace orc { } } - if (numElements > 0) { - bufMoveByteLen -= moveByteLen(numElements * bitWidth); - unpackDefault.unrolledUnpack24(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->getBufStart()); - dstPtr += numElements; - bufRestByteLen = decoder->bufLength(); - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - if (backupByteLen != 0) { - unpackDefault.unrolledUnpack24(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } - - bufRestByteLen = decoder->bufLength(); - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->getBufStart()); + alignTailerBoundary(bitWidth, 24, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2110,8 +2098,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2206,8 +2195,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2224,8 +2213,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, 
bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2259,8 +2249,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2277,8 +2267,9 @@ namespace orc { uint32_t backupByteLen = 0; while (len > 0) { - alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, bufRestByteLen, - len, tailBitLen, backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); @@ -2381,8 +2372,8 @@ namespace orc { } } - alignTailerBoundary(bitWidth, startBit, bufMoveByteLen, bufRestByteLen, len, backupByteLen, - numElements, resetBuf, srcPtr, dstPtr); + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2396,24 +2387,12 @@ namespace orc { bool resetBuf = false; uint64_t tailBitLen = 0; uint32_t backupByteLen = 0; + uint64_t startBit = 0; while (len > 0) { - bufMoveByteLen += moveByteLen(len * bitWidth); - - if (bufMoveByteLen <= bufRestByteLen) { - numElements = len; - resetBuf = false; - } else { - numElements = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH / bitWidth; - len -= numElements; - 
tailBitLen = fmod(bufRestByteLen * ORC_VECTOR_BYTE_WIDTH, bitWidth); - resetBuf = true; - } - - if (tailBitLen != 0) { - backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; - tailBitLen = 0; - } + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); @@ -2432,30 +2411,8 @@ namespace orc { } } - if (numElements > 0) { - bufMoveByteLen -= moveByteLen(numElements * bitWidth); - unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); - srcPtr = reinterpret_cast(decoder->getBufStart()); - dstPtr += numElements; - bufRestByteLen = decoder->bufLength(); - } - - if (bufMoveByteLen <= bufRestByteLen) { - decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); - return; - } - - decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); - if (backupByteLen != 0) { - unpackDefault.unrolledUnpack32(dstPtr, 0, 1); - dstPtr++; - backupByteLen = 0; - len--; - } - - bufRestByteLen = decoder->bufLength(); - bufMoveByteLen = 0; - srcPtr = reinterpret_cast(decoder->getBufStart()); + alignTailerBoundary(bitWidth, 32, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); } } @@ -2468,14 +2425,14 @@ namespace orc { result <<= decoder->getBitsLeft(); result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); bitsLeftToRead -= decoder->getBitsLeft(); - decoder->setCurByte(decoder->readByte()); - decoder->setBitsLeft(8); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); result |= (decoder->getCurByte() >> 
decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index aad178b13c..bd7f98f577 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -21,6 +21,7 @@ #include #include +#include #include "BpackingDefault.hh" @@ -71,6 +72,7 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); + template inline void alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, uint64_t& startBit, uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, uint64_t& remainingNumElements, @@ -78,11 +80,12 @@ namespace orc { uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, int64_t*& dstPtr); - inline void alignTailerBoundary(const uint32_t bitWidth, uint64_t& startBit, - uint64_t& bufMoveByteLen, uint64_t& bufRestByteLen, - uint64_t& remainingNumElements, uint32_t& backupByteLen, - uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, - int64_t*& dstPtr); + template + inline void alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& remainingNumElements, + uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr); private: RleDecoderV2* decoder; diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index a2f7be3b62..0c90eac81d 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -168,6 +168,7 @@ set(SOURCE_FILES ColumnWriter.cc Common.cc Compression.cc + ConvertColumnReader.cc CpuInfoUtil.cc Exceptions.cc Int128.cc From b89870adccf4c5e74f6779f5569db749487c308d Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 18 Apr 2023 11:24:41 -0400 Subject: [PATCH 72/80] Added the comments of function alignHeaderBoundary and alignTailerBoundary --- c++/src/BpackingAvx512.hh | 37 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index bd7f98f577..aa1784bad0 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -72,6 +72,26 @@ namespace orc { void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, uint64_t& startBit); + /** + * In the processing of AVX512 unpacking, AVX512 instructions can only process the memory align data. + * It means that if data input is not memory align (@param startBit != 0), we need to process the + * unaligned data. After that, it could be use AVX512 instructions to process these memory align data. + * + * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width data will + * not have bits offset in one Byte, so it will be false. For other bits data, it will be true. + * @param bitWidth The unpacking data bit width + * @param bitMaxSize The unpacking data needs the Max bit size (8X) + * @param startBit The start bit position in one Byte + * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the unpacking + * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after unpacking + * @param remainingNumElements After unpacking, the remaining elements number need to be processed + * @param tailBitLen After unpacking, the tail bits length + * @param backupByteLen The backup Byte length after unpacking + * @param numElements Currently, the number of elements need to be processed + * @param resetBuf When the current buffer has already been processed, it need to be reset the buffer + * @param srcPtr the pointer of source data + * @param dstPtr the pointer of destinative data + */ template inline void alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, uint64_t& startBit, uint64_t& bufMoveByteLen, @@ -80,6 +100,23 @@ namespace orc { uint64_t& numElements, bool& resetBuf, const 
uint8_t*& srcPtr, int64_t*& dstPtr); + /** + * After AVX512 unpacking processed, there could be some scattered data not be process, + * it needs to be processed by the default way. + * + * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width data will + * not have bits offset in one Byte, so it will be false. For other bits data, it will be true. + * @param bitWidth The unpacking data bit width + * @param startBit The start bit position in one Byte + * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the unpacking + * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after unpacking + * @param remainingNumElements After unpacking, the remaining elements number need to be processed + * @param backupByteLen The backup Byte length after unpacking + * @param numElements Currently, the number of elements need to be processed + * @param resetBuf When the current buffer has already been processed, it need to be reset the buffer + * @param srcPtr the pointer of source data + * @param dstPtr the pointer of destinative data + */ template inline void alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit, uint64_t& startBit, uint64_t& bufMoveByteLen, From fe09a92994a3158a14c8b4053e95cfdf712c6fa5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 18 Apr 2023 11:45:05 -0400 Subject: [PATCH 73/80] Delete useless header file --- c++/src/BpackingAvx512.hh | 1 - 1 file changed, 1 deletion(-) diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index aa1784bad0..7849891b1b 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -21,7 +21,6 @@ #include #include -#include #include "BpackingDefault.hh" From e236773e805b873be87648cb487a6ea7176b42b1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 18 Apr 2023 16:29:23 -0400 Subject: [PATCH 74/80] Code format change --- c++/src/BpackingAvx512.hh | 41 
+++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index 7849891b1b..bc2ba5df3d 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -72,22 +72,28 @@ namespace orc { uint64_t& startBit); /** - * In the processing of AVX512 unpacking, AVX512 instructions can only process the memory align data. - * It means that if data input is not memory align (@param startBit != 0), we need to process the - * unaligned data. After that, it could be use AVX512 instructions to process these memory align data. + * In the processing of AVX512 unpacking, AVX512 instructions can only process the memory align + * data. It means that if data input is not memory align (@param startBit != 0), we need to + * process the unaligned data. After that, it could be use AVX512 instructions to process these + * memory align data. * - * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width data will - * not have bits offset in one Byte, so it will be false. For other bits data, it will be true. + * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width + * data will not have bits offset in one Byte, so it will be false. For other bits data, it will + * be true. 
* @param bitWidth The unpacking data bit width * @param bitMaxSize The unpacking data needs the Max bit size (8X) * @param startBit The start bit position in one Byte - * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the unpacking - * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after unpacking - * @param remainingNumElements After unpacking, the remaining elements number need to be processed + * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the + * unpacking + * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after + * unpacking + * @param remainingNumElements After unpacking, the remaining elements number need to be + * processed * @param tailBitLen After unpacking, the tail bits length * @param backupByteLen The backup Byte length after unpacking * @param numElements Currently, the number of elements need to be processed - * @param resetBuf When the current buffer has already been processed, it need to be reset the buffer + * @param resetBuf When the current buffer has already been processed, it need to be reset the + * buffer * @param srcPtr the pointer of source data * @param dstPtr the pointer of destinative data */ @@ -103,16 +109,21 @@ namespace orc { * After AVX512 unpacking processed, there could be some scattered data not be process, * it needs to be processed by the default way. * - * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width data will - * not have bits offset in one Byte, so it will be false. For other bits data, it will be true. + * @tparam hasBitOffset If currently processed data has offset bits in one Byte, 8X-bit width + * data will not have bits offset in one Byte, so it will be false. For other bits data, it will + * be true. 
* @param bitWidth The unpacking data bit width * @param startBit The start bit position in one Byte - * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the unpacking - * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after unpacking - * @param remainingNumElements After unpacking, the remaining elements number need to be processed + * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the + * unpacking + * @param bufRestByteLen In the current buffer, there will be some rest Bytes length after + * unpacking + * @param remainingNumElements After unpacking, the remaining elements number need to be + * processed * @param backupByteLen The backup Byte length after unpacking * @param numElements Currently, the number of elements need to be processed - * @param resetBuf When the current buffer has already been processed, it need to be reset the buffer + * @param resetBuf When the current buffer has already been processed, it need to be reset the + * buffer * @param srcPtr the pointer of source data * @param dstPtr the pointer of destinative data */ From df6fe45d04f57893fa40ebb5c3080e38347c3917 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 19 Apr 2023 08:41:48 -0400 Subject: [PATCH 75/80] Add a parameter comments --- c++/src/BpackingAvx512.hh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh index bc2ba5df3d..7197b67d4d 100644 --- a/c++/src/BpackingAvx512.hh +++ b/c++/src/BpackingAvx512.hh @@ -113,6 +113,8 @@ namespace orc { * data will not have bits offset in one Byte, so it will be false. For other bits data, it will * be true. 
* @param bitWidth The unpacking data bit width + * @param specialBit 8X bit width data is the specialBit, they have the different unpackDefault + * functions with others * @param startBit The start bit position in one Byte * @param bufMoveByteLen In the current buffer, it will be processed/moved Bytes length in the * unpacking From f3ff21551e3fc84b621827859e8bd124a246ad76 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 21 Apr 2023 09:59:56 -0400 Subject: [PATCH 76/80] Change the invoking way about bufferstart,bufferend parameters. --- c++/src/RLEv2.hh | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index 218aa5fa59..1cee59d0a6 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -168,16 +168,16 @@ namespace orc { unsigned char readByte(); - void setBufStart(char* start) { - bufferStart = start; + void setBufStart(const char* start) { + bufferStart = const_cast(start); } char* getBufStart() { return bufferStart; } - void setBufEnd(char* end) { - bufferEnd = end; + void setBufEnd(const char* end) { + bufferEnd = const_cast(end); } char* getBufEnd() { @@ -188,11 +188,11 @@ namespace orc { return bufferEnd - bufferStart; } - void setBitsLeft(uint32_t bits) { + void setBitsLeft(const uint32_t bits) { bitsLeft = bits; } - void setCurByte(uint32_t byte) { + void setCurByte(const uint32_t byte) { curByte = byte; } @@ -256,14 +256,13 @@ namespace orc { char* bufferEnd; uint64_t runLength; // Length of the current run uint64_t runRead; // Number of returned values of the current run - uint32_t bitsLeft; // Used by readLongs when bitSize < 8 - uint32_t curByte; // Used by anything that uses readLongs + uint32_t bitsLeft; // Used by readLongs when bitSize < 8 + uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; // Values of the current run }; inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, 
uint32_t backupByteLen) { - char* bufStart = getBufStart(); uint64_t remainingLen = bufLength(); int bufferLength = 0; const void* bufferPointer = nullptr; @@ -279,10 +278,10 @@ namespace orc { } if (bufferPointer == nullptr) { - setBufStart(bufStart + len); + bufferStart += len; } else { - setBufStart(const_cast(static_cast(bufferPointer))); - setBufEnd(const_cast(static_cast(bufferPointer)) + bufferLength); + bufferStart = const_cast(static_cast(bufferPointer)); + bufferEnd = bufferStart + bufferLength; } } } // namespace orc From af96de941684b24a78c26b26e36d4daebd94744d Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 21 Apr 2023 21:04:09 -0400 Subject: [PATCH 77/80] 1. Code format change 2. Fix an AVX512 flags check issue on windows. --- c++/src/BpackingDefault.cc | 11 +++++------ c++/src/RleDecoderV2.cc | 4 ++-- cmake_modules/ConfigSimdLevel.cmake | 3 +-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc index 163a3260a1..5a80bc6fb1 100644 --- a/c++/src/BpackingDefault.cc +++ b/c++/src/BpackingDefault.cc @@ -35,15 +35,14 @@ namespace orc { while (curIdx < offset + len) { // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. while (decoder->getBitsLeft() > 0 && curIdx < offset + len) { - decoder->setBitsLeft(decoder->getBitsLeft() - 4); + decoder->setBitsLeft(decoder->getBitsLeft() - 4); data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15; } if (curIdx == offset + len) return; // Exhaust the buffer uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = - std::min(numGroups, static_cast(decoder->bufLength())); + numGroups = std::min(numGroups, static_cast(decoder->bufLength())); // Avoid updating 'bufferStart' inside the loop. 
auto* buffer = reinterpret_cast(decoder->getBufStart()); uint32_t localByte; @@ -314,14 +313,14 @@ namespace orc { result <<= decoder->getBitsLeft(); result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); bitsLeftToRead -= decoder->getBitsLeft(); - decoder->setCurByte(decoder->readByte()); - decoder->setBitsLeft(8); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); } // handle the left over bits if (bitsLeftToRead > 0) { result <<= bitsLeftToRead; - decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); } data[i] = static_cast(result); diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 59531cb76a..c03294ecf1 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -95,11 +95,11 @@ namespace orc { inputStream(std::move(input)), isSigned(_isSigned), firstByte(0), - bufferStart(nullptr), + bufferStart(nullptr), bufferEnd(bufferStart), runLength(0), runRead(0), - bitsLeft(0), + bitsLeft(0), curByte(0), unpackedPatch(pool, 0), literals(pool, MAX_LITERAL_SIZE) { diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 9a82d82c9d..e198573828 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -37,14 +37,13 @@ if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") - check_cxx_compiler_flag(${ORC_AVX512_FLAG} COMPILER_SUPPORT_AVX512) else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. 
set(ORC_AVX512_FLAG "-march=native -mtune=native") - check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) endif() + check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 From 0bfc862a6d7a58cda4755673d7c08fe8efc2028f Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 23 Apr 2023 11:58:11 -0400 Subject: [PATCH 78/80] Modified cmakefile about the checking of AVX512. --- cmake_modules/ConfigSimdLevel.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index e198573828..113302fb35 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +INCLUDE(CheckCXXSourceRuns) INCLUDE(CheckCXXCompilerFlag) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") @@ -37,13 +38,14 @@ if(ORC_CPU_FLAG STREQUAL "x86") # x86/amd64 compiler flags, msvc/gcc/clang if(MSVC) set(ORC_AVX512_FLAG "/arch:AVX512") + check_cxx_compiler_flag(${ORC_AVX512_FLAG} COMPILER_SUPPORT_AVX512) else() # "arch=native" selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. # Using -march=native enables all instruction subsets supported by the local machine. # Using -mtune=native produces code optimized for the local machine under the constraints of the selected instruction set. 
set(ORC_AVX512_FLAG "-march=native -mtune=native") + check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) endif() - check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw" COMPILER_SUPPORT_AVX512) if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 @@ -52,7 +54,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") # Check for AVX512 support in the compiler. set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") - CHECK_CXX_SOURCE_COMPILES(" + check_cxx_source_runs(" #ifdef _MSC_VER #include #else From e584a4296d874a0d28e9b389ab7a70ece0a3a6fe Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 23 Apr 2023 22:07:28 -0400 Subject: [PATCH 79/80] Because check_cxx_source_run will be hung on windows, change check_cxx_source_run back CHECK_CXX_SOURCE_COMPILES, and added "grep avx512f /proc/cpuinfo" to check CPU flags. --- cmake_modules/ConfigSimdLevel.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 113302fb35..2072cc3268 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -76,10 +76,13 @@ if(ORC_CPU_FLAG STREQUAL "x86") COMMAND head -1 OUTPUT_VARIABLE flags_ver) message(STATUS "CPU ${flags_ver}") + execute_process(COMMAND grep avx512f /proc/cpuinfo + COMMAND head -1 + OUTPUT_VARIABLE CPU_HAS_AVX512) endif() # Runtime SIMD level it can get from compiler - if(CXX_SUPPORTS_AVX512 AND COMPILER_SUPPORT_AVX512) + if(CPU_HAS_AVX512 AND CXX_SUPPORTS_AVX512 AND COMPILER_SUPPORT_AVX512) message(STATUS "Enabled the AVX512 for RLE bit-unpacking") set(ORC_SIMD_LEVEL "AVX512") add_definitions(-DORC_HAVE_RUNTIME_AVX512) From 4d261eb7e28fa939e6d6e02523ff926a91c271d9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 23 Apr 2023 22:11:00 -0400 Subject: [PATCH 80/80] Change check_cxx_source_runs back to 
CHECK_CXX_SOURCE_COMPILES --- cmake_modules/ConfigSimdLevel.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake index 2072cc3268..86608e63b5 100644 --- a/cmake_modules/ConfigSimdLevel.cmake +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -54,7 +54,7 @@ if(ORC_CPU_FLAG STREQUAL "x86") # Check for AVX512 support in the compiler. set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}") - check_cxx_source_runs(" + CHECK_CXX_SOURCE_COMPILES(" #ifdef _MSC_VER #include #else