Commit 0481a36

Use first bad_words as extra parameters, and implement min-p

1 parent b777bd6 | commit 0481a36
18 files changed: +86 -36 lines changed
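
For context on the feature being added (editor's note, not part of the commit): min-p sampling keeps only tokens whose probability is at least a caller-chosen fraction (min-p) of the most likely token's probability. A minimal host-side sketch of the rule, for illustration only:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Min-p in one line of math: keep token i iff p_i >= minP * max_j p_j.
// Softmax is monotone, so p_i / p_max == exp(logit_i - logit_max) and the
// test can run on raw logits without normalizing first.
std::vector<bool> minPMask(std::vector<float> const& logits, float minP)
{
    float const maxLogit = *std::max_element(logits.begin(), logits.end());
    std::vector<bool> keep(logits.size());
    for (std::size_t i = 0; i < logits.size(); ++i)
    {
        keep[i] = std::exp(logits[i] - maxLogit) >= minP;
    }
    return keep;
}
```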

cpp/include/tensorrt_llm/runtime/decodingInput.h

Lines changed: 3 additions & 1 deletion
@@ -30,7 +30,7 @@ class DecodingInput
     using TensorPtr = std::shared_ptr<ITensor const>;
 
     DecodingInput(SizeType32 maxLength, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength,
-        SizeType32 maxBatchSize, TensorPtr logits, TensorPtr endIds)
+        SizeType32 maxBatchSize, TensorPtr logits, TensorPtr endIds, TensorPtr minP)
         : step{maxLength}
         , maxLength{maxLength}
         , maxAttentionWindow{maxAttentionWindow}
@@ -40,6 +40,7 @@ class DecodingInput
         , maxBadWordsLen{0}
         , logits{std::move(logits)}
         , endIds{std::move(endIds)}
+        , minP{std::move(minP)}
     {
         TLLM_CHECK_WITH_INFO(static_cast<bool>(this->logits), "Invalid logits tensor");
         TLLM_CHECK_WITH_INFO(static_cast<bool>(this->endIds), "Invalid endIds tensor");
@@ -57,6 +58,7 @@ class DecodingInput
     std::optional<std::vector<TensorPtr>>
         logitsVec;    // vector of size [batchSize] contains logits of size [beamWidth, vocabSizePadded], on gpu
     TensorPtr endIds; // [maxBatchSize * beamWidth], on gpu
+    TensorPtr minP;   // [maxBatchSize * beamWidth], on gpu
 
     // optional parameters
     TensorPtr finished; // [maxBatchSize, beamWidth], finished states at current iteration.

cpp/tensorrt_llm/kernels/banBadWords.cu

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ __global__ void ban_bad_words(T* logits, TokenIdType const** output_ids_ptr, Siz
     SizeType32 const* bad_words_lens, SizeType32 vocab_size_padded, SizeType32 const* sequence_lengths,
     SizeType32 max_seq_len)
 {
-    auto const id = blockIdx.x * blockDim.x + threadIdx.x;
+    auto const id = blockIdx.x * blockDim.x + threadIdx.x + 1;
     auto const batch_idx = blockIdx.y / beam_width;
     auto const beam_idx = blockIdx.y % beam_width;
     auto const batch_slot = batch_slots != nullptr ? batch_slots[batch_idx] : batch_idx;
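
The `+ 1` above pairs with the commit message: the first bad_words entry is repurposed to carry extra parameters (here, min-p), so the ban kernel must never treat slot 0 as a real bad word. A host-side simulation of the new index mapping; the slot-0 convention is an assumption inferred from the commit message, not shown in this diff:

```cpp
#include <cstdio>

// Assumed convention (inferred from the commit message): bad_words slot 0 is a
// side channel for extra parameters, so each thread's id is shifted by one and
// only entries 1..N-1 are treated as bannable words.
int main()
{
    int const numThreads = 4;
    for (int tid = 0; tid < numThreads; ++tid)
    {
        int const id = tid + 1; // mirrors `blockIdx.x * blockDim.x + threadIdx.x + 1`
        std::printf("thread %d -> bad_words entry %d\n", tid, id);
    }
    return 0;
}
```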

cpp/tensorrt_llm/kernels/beamSearchKernels.h

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ struct BeamHypotheses
     // Pointers from input
     int const* inputLengths{nullptr}; // [BS, BM] %% context_length
     int const* endIds{nullptr};       // [BS, BM] %% self.end_ids
+    float const* minP{nullptr};       // [BS, BM] %% self.min_p
 
     // Pointers for output
     int* outputIds{nullptr}; // [BS, BM, MSL] %% self.output_ids only used in gather_tree

cpp/tensorrt_llm/kernels/decodingCommon.cu

Lines changed: 21 additions & 6 deletions
@@ -66,7 +66,8 @@ void invokeCurandBatchInitialize(curandState_t* states, int const* batchSlots, c
 template <typename T>
 __global__ void addBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bias, int32_t const* endIds,
     FinishedState const* finished, int32_t const* batchSlots, int32_t batchSize, int32_t maxBatchSize,
-    int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax, bool batchSlotsLogits)
+    int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax, bool batchSlotsLogits,
+    float const* minPs)
 {
     auto const batchIdx = blockIdx.x;
     auto const beamIdx = blockIdx.y;
@@ -114,6 +115,12 @@ __global__ void addBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bia
         logitsPtr[tid] = logit;
     }
 
+    float minP = 0.0f;
+    if (minPs != nullptr)
+    {
+        minP = minPs[batchSlot];
+    }
+
     if (!skipSoftMax)
     {
         maxVal = blockReduceMax<float>((float) maxVal);
@@ -123,10 +130,18 @@ __global__ void addBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bia
     }
     __syncthreads();
 
+    // min_p : probability of token proportional to the max token
+    // compare min_p against exp(logit - maxVal) / exp(maxVal - maxVal) = exp(logit - maxVal)
+
     float sumVal = 0.0f;
     for (int tid = threadIdx.x; tid < vocabSizePadded; tid += blockDim.x)
     {
-        probs[offset + tid] = __expf((float) logitsPtr[tid] - sMaxVal);
+        float rel_prob = __expf((float) logitsPtr[tid] - sMaxVal);
+        if (rel_prob < minP) {
+            rel_prob = 0.0;
+            logitsPtr[tid] = -MAX_T_VAL;
+        }
+        probs[offset + tid] = rel_prob;
         sumVal += (float) probs[offset + tid];
     }
 
@@ -148,7 +163,7 @@ template <typename T>
 void invokeAddBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bias, int32_t const* endIds,
     FinishedState const* finished, int32_t const* batchSlots, int32_t batchSize, int32_t maxBatchSize,
     int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax, bool batchSlotsLogits,
-    cudaStream_t stream)
+    float const* minPs, cudaStream_t stream)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -157,20 +172,20 @@ void invokeAddBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bias, in
     dim3 block(min(vocabRoundedToWarp, 1024));
     // vocabSize, e.g., 30000, 7000.... vocabSize is usually very big.
     addBiasSoftMax<<<grid, block, 0, stream>>>(logits, logitsPtrs, probs, bias, endIds, finished, batchSlots, batchSize,
-        maxBatchSize, beamWidth, vocabSize, vocabSizePadded, skipSoftMax, batchSlotsLogits);
+        maxBatchSize, beamWidth, vocabSize, vocabSizePadded, skipSoftMax, batchSlotsLogits, minPs);
 
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
 
 template void invokeAddBiasSoftMax(float* logits, float** logitsPtrs, float* probs, float const* bias,
     int32_t const* endIds, FinishedState const* finished, int32_t const* batchSlots, int32_t batchSize,
     int32_t maxBatchSize, int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax,
-    bool batchSlotsLogits, cudaStream_t stream);
+    bool batchSlotsLogits, float const* minPs, cudaStream_t stream);
 
 template void invokeAddBiasSoftMax(half* logits, half** logitsPtrs, half* probs, half const* bias,
     int32_t const* endIds, FinishedState const* finished, int32_t const* batchSlots, int32_t batchSize,
     int32_t maxBatchSize, int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax,
-    bool batchSlotsLogits, cudaStream_t stream);
+    bool batchSlotsLogits, float const* minPs, cudaStream_t stream);
 
 template <typename T>
 __global__ void scatterDecodingParamsKernel(T const* src, T* dst, int const* batchSlots, int batchSize)
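
The comment added in the hunk carries the key identity: relative to the maximum, softmax probabilities are exp(logit − maxVal), so the min-p cut can be applied before normalization. A host-side reference of the same computation, a sketch mirroring the kernel (with -infinity standing in for -MAX_T_VAL):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

// Host-side reference for the kernel's min-p step: tokens whose probability,
// relative to the max token, falls below minP are zeroed in probs, and their
// logits are floored so any later logit-based stage also skips them.
std::vector<float> minPSoftmax(std::vector<float> logits, float minP)
{
    float const maxLogit = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < logits.size(); ++i)
    {
        float relProb = std::exp(logits[i] - maxLogit); // == p_i / p_max
        if (relProb < minP)
        {
            relProb = 0.0f;
            logits[i] = -std::numeric_limits<float>::infinity();
        }
        probs[i] = relProb;
        sum += relProb;
    }
    for (float& p : probs)
    {
        p /= sum; // renormalize over the surviving tokens
    }
    return probs;
}

int main()
{
    // With minP = 0.1: exp(-2) ~ 0.135 survives, exp(-5) ~ 0.0067 is cut,
    // leaving normalized probs ~ {0.881, 0.119, 0}.
    for (float p : minPSoftmax({2.0f, 0.0f, -3.0f}, 0.1f))
    {
        std::printf("%.4f\n", p);
    }
}
```

Zeroing the probability and flooring the logit together make the filtered tokens impossible both for the probability-based sampling path and for any stage that re-reads the logits.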

cpp/tensorrt_llm/kernels/decodingCommon.h

Lines changed: 2 additions & 1 deletion
@@ -185,12 +185,13 @@ void invokeCurandBatchInitialize(curandState_t* states, int const* batchSlots, c
 //! \param vocabSizePadded padded vocab size
 //! \param skipSoftMax flag to skip softmax computation
 //! \param batchSlotsLogits flag to use batchSlot as index for logits and probs
+//! \param minPs input buffer [maxBatchSize]. minimum ratio of probability to maximum probability for token consideration.
 //! \param stream stream
 template <typename T>
 void invokeAddBiasSoftMax(T* logits, T** logitsPtrs, T* probs, T const* bias, int32_t const* endIds,
     FinishedState const* finished, int32_t const* batchSlots, int32_t batchSize, int32_t maxBatchSize,
     int32_t beamWidth, int32_t vocabSize, int32_t vocabSizePadded, bool skipSoftMax, bool batchSlotsLogits,
-    cudaStream_t stream);
+    float const* minPs, cudaStream_t stream);
 
 //! \brief Distributes values located in src to dst according to the indieces from batchSlots
 //!

cpp/tensorrt_llm/kernels/decodingKernels.cu

Lines changed: 2 additions & 2 deletions
@@ -758,11 +758,11 @@ void acceptDraftTokensByLogits(T* draftLogits, T** targetLogits, T* draftProbs,
     invokeAddBiasSoftMax(draftLogits, static_cast<T**>(nullptr), draftProbs, static_cast<T*>(nullptr), nullptr,
         finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded,
         /* skip softmax */ false,
-        /* batchSlotLogits */ true, stream);
+        /* batchSlotLogits */ true, (float*) (nullptr), stream);
     invokeAddBiasSoftMax(static_cast<T*>(nullptr), targetLogits, targetProbs, static_cast<T*>(nullptr), nullptr,
         finished, batchSlots, batchSize, maxBatchSize, beamWidth * maxDraftTokens, vocabSize, vocabSizePadded,
         /* skip softmax */ false,
-        /* batchSlotLogits */ true, stream);
+        /* batchSlotLogits */ true, (float*) (nullptr), stream);
 }
 {
     dim3 block(1024);

cpp/tensorrt_llm/layers/beamSearchLayer.cu

Lines changed: 5 additions & 1 deletion
@@ -137,6 +137,7 @@ void BeamSearchLayer<T>::forwardAsyncSingleRequest(
     bh.earlyStoppings = mEarlyStoppingDevice;
     bh.inputLengths = ip->input_lengths->template getPtr<int const>();
     bh.endIds = ip->end_ids.template getPtr<int const>();
+    bh.minP = ip->min_p.template getPtr<float const>();
     bh.logProbsTiled = (op->output_log_probs) ? op->output_log_probs->template getPtr<float>() : nullptr;
     bh.sequenceLengths = op->sequence_length->template getPtr<int>();
     bh.cumLogProbs = op->cum_log_probs->template getPtr<float>();
@@ -183,6 +184,7 @@ void BeamSearchLayer<T>::forwardAsync(
 
     // common inputs
     auto const& endIds = params->end_ids;
+    auto const& minP = params->min_p;
     auto const localBatchSize = static_cast<std::size_t>(params->local_batch_size);
 
     TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() > 1,
@@ -209,8 +211,10 @@ void BeamSearchLayer<T>::forwardAsync(
         = params->logits->slice({dynamic_decode_batch_size, params->logits->shape[1], params->logits->shape[2]},
             dynamic_decode_vocab_size_units_offset);
     auto const end_id_offset = endIds.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size);
+    auto const min_p_offset = minP.slice({dynamic_decode_batch_size}, dynamic_ite * dynamic_decode_batch_size);
 
-    auto forwardParams = std::make_shared<BeamSearchInputParams>(step, ite, logits_offset, end_id_offset,
+
+    auto forwardParams = std::make_shared<BeamSearchInputParams>(step, ite, logits_offset, end_id_offset, min_p_offset,
         *params->src_cache_indirection, static_cast<std::int32_t>(params->max_attention_window),
         static_cast<std::int32_t>(params->sink_token_length), static_cast<std::int32_t>(maxSeqLen));
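
min_p is sliced with the same offset arithmetic as endIds: dynamic-decode iteration ite covers batch entries [ite * dynamic_decode_batch_size, (ite + 1) * dynamic_decode_batch_size). A tiny illustration of that partitioning, not code from the commit:

```cpp
#include <cstdio>

// Illustration of how a flat per-request parameter buffer (endIds, minP) is
// partitioned across dynamic-decode iterations by slice(size, offset).
int main()
{
    int const dynamicDecodeBatchSize = 4;
    for (int ite = 0; ite < 3; ++ite)
    {
        int const offset = ite * dynamicDecodeBatchSize;
        std::printf("ite %d -> entries [%d, %d)\n", ite, offset, offset + dynamicDecodeBatchSize);
    }
    return 0;
}
```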

cpp/tensorrt_llm/layers/beamSearchLayer.h

Lines changed: 3 additions & 3 deletions
@@ -48,9 +48,9 @@ class BeamSearchInputParams : public BaseInputParams
 {
 public:
     explicit BeamSearchInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor logits,
-        tc::Tensor endIds, tc::Tensor src_cache_indirection, runtime::SizeType32 max_attention_window,
-        runtime::SizeType32 sink_token_length, runtime::SizeType32 max_seq_len)
-        : BaseInputParams(step, ite, std::move(endIds))
+        tc::Tensor endIds, tc::Tensor minPs, tc::Tensor src_cache_indirection, runtime::SizeType32 max_attention_window,
+        runtime::SizeType32 sink_token_length, runtime::SizeType32 max_seq_len)
+        : BaseInputParams(step, ite, std::move(endIds), std::move(minPs))
         , logits{std::move(logits)}
         , max_attention_window{max_attention_window}
         , sink_token_length{sink_token_length}

cpp/tensorrt_llm/layers/decodingLayer.cpp

Lines changed: 4 additions & 2 deletions
@@ -206,6 +206,7 @@ std::tuple<std::shared_ptr<BaseOutputParams>, std::shared_ptr<BaseInputParams>>
     auto const localDecoderDomain = getLocalDecoderDomain(params);
     auto const maxSeqLen = outputs->output_ids.shape[outputs->output_ids.shape.size() - 1];
     auto const& endIds = params->end_ids;
+    auto const& minP = params->min_p;
 
     std::shared_ptr<BaseOutputParams> preparedOutputs;
     std::shared_ptr<BaseInputParams> preparedInputs;
@@ -230,8 +231,9 @@ std::tuple<std::shared_ptr<BaseOutputParams>, std::shared_ptr<BaseInputParams>>
         Tensor const logitsSlice{params->logits->slice(
             {localBatchSize, static_cast<size_t>(localDecoderDomain.getBeamWidth()), params->logits->shape[2]}, 0)};
         Tensor const endIdSlice{endIds.slice({localBatchSize}, 0)};
+        Tensor const minPSlice{minP.slice({localBatchSize}, 0)};
         auto decodeInputs = std::make_shared<SamplingInputParams>(
-            step, ite, logitsSlice, endIdSlice, static_cast<SizeType32>(maxSeqLen));
+            step, ite, logitsSlice, endIdSlice, minPSlice, static_cast<SizeType32>(maxSeqLen));
 
         decodeInputs->finished = params->finished;
 
@@ -274,7 +276,7 @@ std::tuple<std::shared_ptr<BaseOutputParams>, std::shared_ptr<BaseInputParams>>
         TLLM_CHECK_WITH_INFO(localDecoderDomain.getBeamWidth() == 1,
             "Decoding mode is Medusa, but beamWidth != 1 (%d != 1)", localDecoderDomain.getBeamWidth());
 
-        auto medusaInputParams = std::make_shared<MedusaInputParams>(params->logits.value(), endIds);
+        auto medusaInputParams = std::make_shared<MedusaInputParams>(params->logits.value(), endIds, minP);
         medusaInputParams->finished = outputs->finished.value();
         medusaInputParams->batch_slots = params->batch_slots;
         medusaInputParams->paths = params->medusaInputs->medusaPaths;

cpp/tensorrt_llm/layers/decodingParams.h

Lines changed: 5 additions & 3 deletions
@@ -164,10 +164,11 @@ class DynamicDecodeSetupParams : public BaseSetupParams
 class BaseInputParams
 {
 public:
-    explicit BaseInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor endIds)
+    explicit BaseInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, tc::Tensor endIds, tc::Tensor minPs)
         : step{step}
         , ite{ite}
         , end_ids{std::move(endIds)}
+        , min_p{std::move(minPs)}
     {
     }
 
@@ -177,6 +178,7 @@ class BaseInputParams
     runtime::SizeType32 step;
     runtime::SizeType32 ite;
     tc::Tensor end_ids; // [maxBatchSize]
+    tc::Tensor min_p;   // [maxBatchSize]
     std::optional<tc::Tensor> batch_slots; // [forwardBatchSize], on pinned memory
     std::optional<tc::Tensor> finished;    // [maxBatchSize, maxBeamWidth]
 };
@@ -186,8 +188,8 @@ class DynamicDecodeInputParams : public BaseInputParams
 public:
     DynamicDecodeInputParams(runtime::SizeType32 step, runtime::SizeType32 ite, runtime::SizeType32 maxInputLength,
         runtime::SizeType32 maxAttentionWindow, runtime::SizeType32 sinkTokenLength, runtime::SizeType32 localBatchSize,
-        tc::Tensor endIds)
-        : BaseInputParams(step, ite, std::move(endIds))
+        tc::Tensor endIds, tc::Tensor minPs)
+        : BaseInputParams(step, ite, std::move(endIds), std::move(minPs))
         , max_input_length{maxInputLength}
         , max_attention_window{maxAttentionWindow}
        , sink_token_length{sinkTokenLength}
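
A miniature of the pattern this hunk extends (illustration only; std::string stands in for tc::Tensor): each params class takes the new tensor by value and forwards it to its base with std::move, so a single owning copy travels down the hierarchy.

```cpp
#include <string>
#include <utility>

// Illustration of the by-value + std::move forwarding used above.
// std::string is a stand-in for tc::Tensor.
struct BaseInputParamsSketch
{
    BaseInputParamsSketch(std::string endIds, std::string minPs)
        : end_ids{std::move(endIds)}
        , min_p{std::move(minPs)}
    {
    }

    std::string end_ids;
    std::string min_p;
};

struct DynamicDecodeInputParamsSketch : BaseInputParamsSketch
{
    DynamicDecodeInputParamsSketch(std::string endIds, std::string minPs)
        : BaseInputParamsSketch(std::move(endIds), std::move(minPs))
    {
    }
};
```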
