From ca857ff09608d8528213a5910c5b11ad6bb81bbe Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 19 Nov 2025 17:58:02 +0100 Subject: [PATCH 01/50] Video to text python sample. --- samples/deployment-requirements.txt | 1 + samples/python/visual_language_chat/README.md | 17 ++- .../video_to_text_chat.py | 100 ++++++++++++++++++ tests/python_tests/samples/conftest.py | 7 +- 4 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 samples/python/visual_language_chat/video_to_text_chat.py diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt index 0f28255dc2..910805dc48 100644 --- a/samples/deployment-requirements.txt +++ b/samples/deployment-requirements.txt @@ -4,3 +4,4 @@ librosa==0.11.0 # For Whisper pillow==12.0.0 # Image processing for VLMs json5==0.12.1 # For ReAct pydantic==2.12.4 # For Structured output json schema +opencv-python # For video-to-text VLM sample diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md index fb82aa4023..3d82f13bc7 100644 --- a/samples/python/visual_language_chat/README.md +++ b/samples/python/visual_language_chat/README.md @@ -2,8 +2,9 @@ This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant. -There are two sample files: +There are three sample files: - [`visual_language_chat.py`](./visual_language_chat.py) demonstrates basic usage of the VLM pipeline. + - [`video_to_text_chat.py`](./video_to_text_chat.py) demonstrates video to text usage of the VLM pipeline. - [`benchmark_vlm.py`](./benchmark_vlm.py) shows how to benchmark a VLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text and calculating various performance metrics. ## Download and convert the model and tokenizers @@ -38,14 +39,22 @@ tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-V-2_6") export_tokenizer(tokenizer, output_dir) ``` -## Run: +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` to run VLM samples. -[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. +## Run image-to-text chat sample: -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: +[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. `python visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` +## Run video-to-text chat sample: + +To run this sample a model that supports video input is required, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. + +[This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video. + +`python video_to_text_chat.py ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4` + Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models Modify the source code to change the device for inference to the GPU. diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py new file mode 100644 index 0000000000..ca137ab0f3 --- /dev/null +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import argparse +import numpy as np +import cv2 +import openvino_genai +from openvino import Tensor +from pathlib import Path + + +def streamer(subword: str) -> bool: + ''' + + Args: + subword: sub-word of the generated text. + + Returns: Return flag corresponds whether generation should be stopped. + + ''' + print(subword, end='', flush=True) + + # No value is returned as in this example we don't want to stop the generation in this method. + # "return None" will be treated the same as "return openvino_genai.StreamingStatus.RUNNING". + + +def read_video(path: str, num_frames: int = 10) -> Tensor: + ''' + + Args: + path: The path to the video. + + Returns: the ov.Tensor containing the video. + + ''' + cap = cv2.VideoCapture(path) + + frames = [] + + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + frames.append(np.array(frame)) + cap.release() + + indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int) + frames = [frames[i] for i in indices] + + return Tensor(frames) + + +def read_videos(path: str) -> list[Tensor]: + entry = Path(path) + if entry.is_dir(): + return [read_video(str(file)) for file in sorted(entry.iterdir())] + return [read_video(path)] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir', help="Path to the model directory") + parser.add_argument('video_dir', help="Path to a video file.") + parser.add_argument('device', nargs='?', default='CPU', help="Device to run the model on (default: CPU)") + args = parser.parse_args() + + video = read_videos(args.video_dir) + + # GPU and NPU can be used as well. + # Note: If NPU is selected, only the language model will be run on the NPU. + enable_compile_cache = dict() + if args.device == "GPU": + # Cache compiled models on disk for GPU to save time on the next run. + # It's not beneficial for CPU. + enable_compile_cache["CACHE_DIR"] = "vlm_cache" + + pipe = openvino_genai.VLMPipeline(args.model_dir, args.device, **enable_compile_cache) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, videos=video, generation_config=config, streamer=streamer) + + while True: + try: + prompt = input("\n----------\n" + "question:\n") + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + pipe.finish_chat() + + +if '__main__' == __name__: + main() diff --git a/tests/python_tests/samples/conftest.py b/tests/python_tests/samples/conftest.py index 8a011ccfe2..659d548d0f 100644 --- a/tests/python_tests/samples/conftest.py +++ b/tests/python_tests/samples/conftest.py @@ -143,6 +143,10 @@ "tiny-random-SpeechT5ForTextToSpeech": { "name": "hf-internal-testing/tiny-random-SpeechT5ForTextToSpeech", "convert_args": ["--model-kwargs", json.dumps({"vocoder": "fxmarty/speecht5-hifigan-tiny"})] + }, + "tiny-random-llava-next-video": { + "name": "optimum-intel-internal-testing/tiny-random-llava-next-video", + "convert_args": ["--task", "image-text-to-text"] } } @@ -159,7 +163,8 @@ "cat.png": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png", "cat": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", "3283_1447_000.tar.gz": "https://huggingface.co/datasets/facebook/multilingual_librispeech/resolve/main/data/mls_polish/train/audio/3283_1447_000.tar.gz", - "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin" + "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin", + "videos/sample_video.mp4": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4", } SAMPLES_PY_DIR = Path(os.environ.get("SAMPLES_PY_DIR", os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../samples/python")))) From 3b3c69d3df8e2a31e348fec60385a254c36303e2 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 20 Nov 2025 10:59:15 +0100 Subject: [PATCH 02/50] Sample test. --- .../samples/test_video_to_text_chat.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/python_tests/samples/test_video_to_text_chat.py diff --git a/tests/python_tests/samples/test_video_to_text_chat.py b/tests/python_tests/samples/test_video_to_text_chat.py new file mode 100644 index 0000000000..b8023cf417 --- /dev/null +++ b/tests/python_tests/samples/test_video_to_text_chat.py @@ -0,0 +1,36 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import pytest +import subprocess # nosec B404 +import sys + +from conftest import SAMPLES_PY_DIR, SAMPLES_CPP_DIR, SAMPLES_C_DIR +from test_utils import run_sample + +class TestVisualLanguageChat: + @pytest.mark.vlm + @pytest.mark.samples + @pytest.mark.parametrize( + "convert_model, download_test_content, questions", + [ + pytest.param("tiny-random-llava-next-video", "videos/sample_video.mp4", 'What is unusual on this video?\nGo on.') + ], + indirect=["convert_model", "download_test_content"], + ) + def test_sample_visual_language_chat(self, convert_model, download_test_content, questions): + # Test CPP sample + # TODO + + # Test C sample + # TODO + + # Test Python sample + py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/video_to_text_chat.py") + py_command = [sys.executable, py_script, convert_model, download_test_content] + py_result = run_sample(py_command, questions) + + # Compare results + # assert py_result.stdout == cpp_result.stdout, f"Results should match" + # assert cpp_result.stdout == c_result.stdout, f"Results should match" From b4a84f773460e96c273197c572ec2b37d6e4169e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 20 Nov 2025 10:41:58 +0100 Subject: [PATCH 03/50] Update samples/python/visual_language_chat/video_to_text_chat.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- samples/python/visual_language_chat/video_to_text_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index ca137ab0f3..72f086c74a 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -96,5 +96,5 @@ def main(): pipe.finish_chat() -if '__main__' == __name__: +if __name__ == '__main__': main() From 29d78c43a0ca653ad22d027da7f26dbb7aefb77b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 24 Nov 2025 12:28:55 +0100 Subject: [PATCH 04/50] Added c++ sample. --- .../cpp/visual_language_chat/CMakeLists.txt | 8 ++ samples/cpp/visual_language_chat/README.md | 16 ++- .../video_to_text_chat.cpp | 123 ++++++++++++++++++ .../video_to_text_chat.py | 5 +- 4 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 samples/cpp/visual_language_chat/video_to_text_chat.cpp diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 54795351a6..663f1e40ab 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -8,6 +8,8 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) +find_package(OpenCV REQUIRED) + file(DOWNLOAD https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h ${CMAKE_BINARY_DIR}/stb_image.h @@ -55,3 +57,9 @@ install(TARGETS benchmark_vlm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) + +# create video_to_text_chat executable +add_executable(video_to_text_chat video_to_text_chat.cpp) + +target_include_directories(video_to_text_chat PRIVATE ${OpenCV_INCLUDE_DIRS} "${CMAKE_BINARY_DIR}") +target_link_libraries(video_to_text_chat PRIVATE ${OpenCV_LIBS} openvino::genai cxxopts::cxxopts) \ No newline at end of file diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index 59819e45cc..4c283f57a4 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -3,8 +3,9 @@ This example showcases inference of Visual language models (VLMs). The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::VLMPipeline` and runs the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant. -There are two sample files: +There are three sample files: - [`visual_language_chat.cpp`](./visual_language_chat.cpp) demonstrates basic usage of the VLM pipeline. + - [`video_to_text_chat.cpp`](./video_to_text_chat.cpp) demonstrates video to text usage of the VLM pipeline. - [`benchmark_vlm.cpp`](./benchmark_vlm.cpp) shows how to benchmark a VLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text and calculating various performance metrics. @@ -19,9 +20,9 @@ pip install --upgrade-strategy eager -r ../../requirements.txt optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6 ``` -## Run +Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run samples. -Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. +## Run visual language chat: [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. @@ -31,6 +32,15 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) for more details. + +## Run video to text chat: + +To run this sample a model that supports video input is required, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. + +[This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video. + +`video_to_text_chat ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4` + ## Run benchmark: ```sh diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp new file mode 100644 index 0000000000..91c7b66df1 --- /dev/null +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -0,0 +1,123 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +namespace fs = std::filesystem; + +std::vector make_indices(size_t total_frames, size_t num_frames) { + std::vector indices; + indices.reserve(num_frames); + + auto step = float(total_frames) / num_frames; + + for (size_t i = 0; i < num_frames; ++i) { + size_t idx = std::min(size_t(i * step), total_frames - 1); + indices.push_back(idx); + } + + return indices; +} + +ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames = 10) { + cv::VideoCapture cap(video_path); + + if (!cap.isOpened()) { + OPENVINO_THROW("Could not open the video file."); + } + size_t total_num_frames = cap.get(cv::CAP_PROP_FRAME_COUNT); + auto indices = make_indices(total_num_frames, num_frames); + + std::vector frames; + cv::Mat frame; + size_t width = cap.get(cv::CAP_PROP_FRAME_WIDTH); + size_t height = cap.get(cv::CAP_PROP_FRAME_HEIGHT); + ov::Tensor video_tensor(ov::element::u8, ov::Shape{num_frames, height, width, 3}); + auto video_tensor_data = video_tensor.data(); + + size_t frame_idx = 0; + while (cap.read(frame)) { + if (std::find(indices.begin(), indices.end(), frame_idx) != indices.end()) { + memcpy(video_tensor_data, frame.data, frame.total() * 3 * sizeof(uint8_t)); + video_tensor_data += frame.total() * 3; + } + frame_idx++; + } + + return video_tensor; +} + +std::vector load_videos(const std::filesystem::path& input_path) { + if (input_path.empty() || !fs::exists(input_path)) { + throw std::runtime_error{"Path to images is empty or does not exist."}; + } + if (fs::is_directory(input_path)) { + std::set sorted_videos{fs::directory_iterator(input_path), fs::directory_iterator()}; + std::vector videos; + for (const fs::path& dir_entry : sorted_videos) { + videos.push_back(load_video(dir_entry)); + } + return videos; + } + return {load_video(input_path)}; +} + +bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); +} + +int main(int argc, char* argv[]) try { + if (argc < 3 || argc > 4) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + } + + std::vector videos = load_videos(argv[2]); + + // GPU and NPU can be used as well. + // Note: If NPU is selected, only language model will be run on NPU + std::string device = (argc == 4) ? argv[3] : "CPU"; + ov::AnyMap enable_compile_cache; + if (device == "GPU") { + // Cache compiled models on disk for GPU to save time on the + // next run. It's not beneficial for CPU. + enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + + ov::genai::GenerationConfig generation_config; + generation_config.max_new_tokens = 100; + + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + pipe.generate(prompt, + ov::genai::videos(videos), + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} \ No newline at end of file diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index 72f086c74a..cb8e6bd503 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -45,7 +45,6 @@ def read_video(path: str, num_frames: int = 10) -> Tensor: break frames.append(np.array(frame)) - cap.release() indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int) frames = [frames[i] for i in indices] @@ -67,7 +66,7 @@ def main(): parser.add_argument('device', nargs='?', default='CPU', help="Device to run the model on (default: CPU)") args = parser.parse_args() - video = read_videos(args.video_dir) + videos = read_videos(args.video_dir) # GPU and NPU can be used as well. # Note: If NPU is selected, only the language model will be run on the NPU. @@ -84,7 +83,7 @@ def main(): pipe.start_chat() prompt = input('question:\n') - pipe.generate(prompt, videos=video, generation_config=config, streamer=streamer) + pipe.generate(prompt, videos=videos, generation_config=config, streamer=streamer) while True: try: From 1a8944aab51f449aa296e88c110e35be57d1365f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 24 Nov 2025 13:13:31 +0100 Subject: [PATCH 05/50] Attempt to add opencv build to ga workflow. --- .github/workflows/linux.yml | 41 ++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index a4ad20710b..2ef666bb59 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -240,6 +240,44 @@ jobs: path: ${{ env.MANIFEST_PATH }} if-no-files-found: 'error' + opencv_build_cmake: + name: Build OpenCV + strategy: + fail-fast: false + matrix: + build-type: [Release] + timeout-minutes: 45 + defaults: + run: + shell: bash + runs-on: aks-linux-4-cores-16gb + container: + image: openvinogithubactions.azurecr.io/ov_build/ubuntu_22_04_x64:${{ needs.openvino_download.outputs.docker_tag }} + volumes: + - /mount:/mount + - ${{ github.workspace }}:${{ github.workspace }} + options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING + env: + CMAKE_GENERATOR: Unix Makefiles + BUILD_DIR: ${{ github.workspace }}/build_opencv + SRC_DIR: ${{ github.workspace }}/src_opencv + MANIFEST_PATH: ${{ github.workspace }}/manifest.yml + + steps: + - name: Clone OpenCV + uses: actions/checkout@4.x + with: + repository: 'opencv/opencv' + path: ${{ env.SRC_DIR }} + + - name: CMake Build + run: | + mkdir {{ env.BUILD_DIR }} + cd {{ env.BUILD_DIR }} + cmake -DCMAKE_BUILD_TYPE=Release -DWITH_INF_ENGINE=y -DOpenVINO_DIR=$GITHUB_WORKSPACE/ov/runtime/cmake/ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_LINKER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_C_LINKER_LAUNCHER=ccache -DBUILD_TESTS=y -DVIDEOIO_ENABLE_PLUGINS=y -DBUILD_PERF_TESTS=n -DBUILD_EXAMPLES=n -DBUILD_opencv_apps=y -DWITH_OPENCL=n -DWITH_OPENCLAMDBLAS=n -DWITH_GSTREAMER=n -DWITH_V4L=ON -DWITH_LIBV4L=ON -DWITH_OPENCLAMDFFT=n -DWITH_VA=n -DWITH_VA_INTEL=n -DWITH_PROTOBUF=n -DBUILD_PROTOBUF=n -DBUILD_JAVA=n -DBUILD_opencv_java_bindings_generator=n -DBUILD_opencv_python2=n -DBUILD_opencv_python3=n -DWITH_IMGCODEC_HDR=y -DWITH_IMGCODEC_SUNRASTER=y -DWITH_IMGCODEC_PXM=y -DWITH_IMGCODEC_PFM=y -DWITH_PNG=y -DWITH_TIFF=n -DWITH_WEBP=n -DWITH_OPENJPEG=n -DWITH_JASPER=n -DWITH_OPENEXR=n -DBUILD_opencv_dnn=n -DBUILD_opencv_features2d=n -DBUILD_opencv_flann=n -DWITH_TBB=n -DBUILD_INFO_SKIP_EXTRA_MODULES=n -DBUILD_JASPER=n -DBUILD_PNG=n -DBUILD_OPENEXR=n -DBUILD_WEBP=n -DBUILD_ZLIB=n -DWITH_CUDA=n -DWITH_EIGEN=n -DWITH_GPHOTO2=n -DOPENCV_GAPI_GSTREAMER=n -DWITH_LAPACK=n -DWITH_MATLAB=n -DWITH_MFX=n -DWITH_QUIRC=n -DWITH_VTK=n -DINSTALL_PDB=n -DINSTALL_TESTS=n -DINSTALL_C_EXAMPLES=n -DINSTALL_PYTHON_EXAMPLES=n -DOPENCV_GENERATE_SETUPVARS=n -DWITH_1394=n -DWITH_FFMPEG=y -DWITH_GTK_2_X=y -DBUILD_JPEG=y -DWITH_IPP=y -DWITH_AVIF=n -DENABLE_CONFIG_VERIFICATION=y -DBUILD_LIST=core,gapi,highgui,imgcodecs,imgproc,videoio,video .. + cmake --build ${{ env.BUILD_DIR }} --config ${{ matrix.build-type }} --parallel $(nproc) --verbose + + genai_build_wheel: name: Build Wheel needs: [ openvino_download ] @@ -378,7 +416,7 @@ jobs: fail-fast: false matrix: build-type: [Release] - needs: [ openvino_download, genai_build_cmake ] + needs: [ openvino_download, genai_build_cmake, opencv_build_cmake ] timeout-minutes: 10 defaults: run: @@ -396,6 +434,7 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/install BUILD_DIR: ${{ github.workspace }}/build SRC_DIR: ${{ github.workspace }}/src + OpenCV_DIR: ${{ github.workspace }}/build_opencv steps: - name: Clone openvino.genai From 1a0d25c8e3405c4a35cf6479fdbb1048e4cf984b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 24 Nov 2025 13:32:49 +0100 Subject: [PATCH 06/50] Revert "Attempt to add opencv build to ga workflow." This reverts commit 1a8944aab51f449aa296e88c110e35be57d1365f. --- .github/workflows/linux.yml | 41 +------------------------------------ 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 2ef666bb59..a4ad20710b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -240,44 +240,6 @@ jobs: path: ${{ env.MANIFEST_PATH }} if-no-files-found: 'error' - opencv_build_cmake: - name: Build OpenCV - strategy: - fail-fast: false - matrix: - build-type: [Release] - timeout-minutes: 45 - defaults: - run: - shell: bash - runs-on: aks-linux-4-cores-16gb - container: - image: openvinogithubactions.azurecr.io/ov_build/ubuntu_22_04_x64:${{ needs.openvino_download.outputs.docker_tag }} - volumes: - - /mount:/mount - - ${{ github.workspace }}:${{ github.workspace }} - options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING - env: - CMAKE_GENERATOR: Unix Makefiles - BUILD_DIR: ${{ github.workspace }}/build_opencv - SRC_DIR: ${{ github.workspace }}/src_opencv - MANIFEST_PATH: ${{ github.workspace }}/manifest.yml - - steps: - - name: Clone OpenCV - uses: actions/checkout@4.x - with: - repository: 'opencv/opencv' - path: ${{ env.SRC_DIR }} - - - name: CMake Build - run: | - mkdir {{ env.BUILD_DIR }} - cd {{ env.BUILD_DIR }} - cmake -DCMAKE_BUILD_TYPE=Release -DWITH_INF_ENGINE=y -DOpenVINO_DIR=$GITHUB_WORKSPACE/ov/runtime/cmake/ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_LINKER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_C_LINKER_LAUNCHER=ccache -DBUILD_TESTS=y -DVIDEOIO_ENABLE_PLUGINS=y -DBUILD_PERF_TESTS=n -DBUILD_EXAMPLES=n -DBUILD_opencv_apps=y -DWITH_OPENCL=n -DWITH_OPENCLAMDBLAS=n -DWITH_GSTREAMER=n -DWITH_V4L=ON -DWITH_LIBV4L=ON -DWITH_OPENCLAMDFFT=n -DWITH_VA=n -DWITH_VA_INTEL=n -DWITH_PROTOBUF=n -DBUILD_PROTOBUF=n -DBUILD_JAVA=n -DBUILD_opencv_java_bindings_generator=n -DBUILD_opencv_python2=n -DBUILD_opencv_python3=n -DWITH_IMGCODEC_HDR=y -DWITH_IMGCODEC_SUNRASTER=y -DWITH_IMGCODEC_PXM=y -DWITH_IMGCODEC_PFM=y -DWITH_PNG=y -DWITH_TIFF=n -DWITH_WEBP=n -DWITH_OPENJPEG=n -DWITH_JASPER=n -DWITH_OPENEXR=n -DBUILD_opencv_dnn=n -DBUILD_opencv_features2d=n -DBUILD_opencv_flann=n -DWITH_TBB=n -DBUILD_INFO_SKIP_EXTRA_MODULES=n -DBUILD_JASPER=n -DBUILD_PNG=n -DBUILD_OPENEXR=n -DBUILD_WEBP=n -DBUILD_ZLIB=n -DWITH_CUDA=n -DWITH_EIGEN=n -DWITH_GPHOTO2=n -DOPENCV_GAPI_GSTREAMER=n -DWITH_LAPACK=n -DWITH_MATLAB=n -DWITH_MFX=n -DWITH_QUIRC=n -DWITH_VTK=n -DINSTALL_PDB=n -DINSTALL_TESTS=n -DINSTALL_C_EXAMPLES=n -DINSTALL_PYTHON_EXAMPLES=n -DOPENCV_GENERATE_SETUPVARS=n -DWITH_1394=n -DWITH_FFMPEG=y -DWITH_GTK_2_X=y -DBUILD_JPEG=y -DWITH_IPP=y -DWITH_AVIF=n -DENABLE_CONFIG_VERIFICATION=y -DBUILD_LIST=core,gapi,highgui,imgcodecs,imgproc,videoio,video .. - cmake --build ${{ env.BUILD_DIR }} --config ${{ matrix.build-type }} --parallel $(nproc) --verbose - - genai_build_wheel: name: Build Wheel needs: [ openvino_download ] @@ -416,7 +378,7 @@ jobs: fail-fast: false matrix: build-type: [Release] - needs: [ openvino_download, genai_build_cmake, opencv_build_cmake ] + needs: [ openvino_download, genai_build_cmake ] timeout-minutes: 10 defaults: run: @@ -434,7 +396,6 @@ jobs: INSTALL_DIR: ${{ github.workspace }}/install BUILD_DIR: ${{ github.workspace }}/build SRC_DIR: ${{ github.workspace }}/src - OpenCV_DIR: ${{ github.workspace }}/build_opencv steps: - name: Clone openvino.genai From 4d070ab5a381abc07b3863e1b8e24f086378abfd Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 11:08:51 +0100 Subject: [PATCH 07/50] Used FetchContent to add opencv. --- .../cpp/visual_language_chat/CMakeLists.txt | 23 +++++++++++++++---- .../video_to_text_chat.cpp | 3 ++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 663f1e40ab..e45f46b844 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -8,8 +8,6 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -find_package(OpenCV REQUIRED) - file(DOWNLOAD https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h ${CMAKE_BINARY_DIR}/stb_image.h @@ -58,8 +56,23 @@ install(TARGETS benchmark_vlm COMPONENT samples_bin EXCLUDE_FROM_ALL) -# create video_to_text_chat executable + +include(FetchContent) +FetchContent_Declare( + opencv + GIT_REPOSITORY https://github.com/opencv/opencv.git + GIT_TAG 4.6.0 + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE +) +FetchContent_MakeAvailable(opencv) + + add_executable(video_to_text_chat video_to_text_chat.cpp) -target_include_directories(video_to_text_chat PRIVATE ${OpenCV_INCLUDE_DIRS} "${CMAKE_BINARY_DIR}") -target_link_libraries(video_to_text_chat PRIVATE ${OpenCV_LIBS} openvino::genai cxxopts::cxxopts) \ No newline at end of file +target_include_directories(video_to_text_chat PRIVATE + ${OPENCV_CONFIG_FILE_INCLUDE_DIR} + ${OPENCV_MODULE_opencv_core_LOCATION}/include + ${OPENCV_MODULE_opencv_videoio_LOCATION}/include + ) +target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) \ No newline at end of file diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 91c7b66df1..943934d633 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include +#include +#include #include #include From a8fa911222946db9a373777d5909688e9cb3cc60 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 11:12:41 +0100 Subject: [PATCH 08/50] Corrected test. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- tests/python_tests/samples/test_video_to_text_chat.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index e45f46b844..8af80ef885 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -75,4 +75,4 @@ target_include_directories(video_to_text_chat PRIVATE ${OPENCV_MODULE_opencv_core_LOCATION}/include ${OPENCV_MODULE_opencv_videoio_LOCATION}/include ) -target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) \ No newline at end of file +target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) diff --git a/tests/python_tests/samples/test_video_to_text_chat.py b/tests/python_tests/samples/test_video_to_text_chat.py index b8023cf417..0b7c1c2405 100644 --- a/tests/python_tests/samples/test_video_to_text_chat.py +++ b/tests/python_tests/samples/test_video_to_text_chat.py @@ -21,10 +21,9 @@ class TestVisualLanguageChat: ) def test_sample_visual_language_chat(self, convert_model, download_test_content, questions): # Test CPP sample - # TODO - - # Test C sample - # TODO + cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'video_to_text_chat') + cpp_command = [cpp_sample, convert_model, download_test_content] + cpp_result = run_sample(cpp_command, questions) # Test Python sample py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/video_to_text_chat.py") @@ -32,5 +31,4 @@ def test_sample_visual_language_chat(self, convert_model, download_test_content, py_result = run_sample(py_command, questions) # Compare results - # assert py_result.stdout == cpp_result.stdout, f"Results should match" - # assert cpp_result.stdout == c_result.stdout, f"Results should match" + assert py_result.stdout == cpp_result.stdout, f"Results should match" From bdc694030f9c4a13751ec4bd206295e48459cdbc Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 13:10:48 +0100 Subject: [PATCH 09/50] Convert path to string(). --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 943934d633..fb1cc9858f 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -24,7 +24,7 @@ std::vector make_indices(size_t total_frames, size_t num_frames) { } ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames = 10) { - cv::VideoCapture cap(video_path); + cv::VideoCapture cap(video_path.string()); if (!cap.isOpened()) { OPENVINO_THROW("Could not open the video file."); From 735060ed12f85d86ee46b0252cb0c6dcca48c84f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 13:18:52 +0100 Subject: [PATCH 10/50] Updated readme. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- samples/cpp/visual_language_chat/README.md | 2 ++ samples/python/visual_language_chat/README.md | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 8af80ef885..5e1e714926 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -61,7 +61,7 @@ include(FetchContent) FetchContent_Declare( opencv GIT_REPOSITORY https://github.com/opencv/opencv.git - GIT_TAG 4.6.0 + GIT_TAG 4.12.0 GIT_SHALLOW TRUE GIT_PROGRESS TRUE ) diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index 4c283f57a4..7ace182f4b 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -41,6 +41,8 @@ To run this sample a model that supports video input is required, for example `l `video_to_text_chat ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4` +Supported models with video input are listed in [this section](https://openvinotoolkit.github.io/openvino.genai/docs/use-cases/image-processing/#use-image-or-video-tags-in-prompt). + ## Run benchmark: ```sh diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md index 3d82f13bc7..17f0308a14 100644 --- a/samples/python/visual_language_chat/README.md +++ b/samples/python/visual_language_chat/README.md @@ -47,6 +47,8 @@ Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pi `python visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + ## Run video-to-text chat sample: To run this sample a model that supports video input is required, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. @@ -55,12 +57,11 @@ To run this sample a model that supports video input is required, for example `l `python video_to_text_chat.py ./LLaVA-NeXT-Video-7B-hf/ sample_demo_1.mp4` +Supported models with video input are listed in [this section](https://openvinotoolkit.github.io/openvino.genai/docs/use-cases/image-processing/#use-image-or-video-tags-in-prompt). Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models Modify the source code to change the device for inference to the GPU. -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - ## Run benchmark: ```sh From 5dfdcf73339b729ed51d23dbf177fd7850fcb953 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 16:09:12 +0100 Subject: [PATCH 11/50] Set 8 frames. --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 2 +- samples/python/visual_language_chat/video_to_text_chat.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index fb1cc9858f..21b95fa620 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -23,7 +23,7 @@ std::vector make_indices(size_t total_frames, size_t num_frames) { return indices; } -ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames = 10) { +ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames = 8) { cv::VideoCapture cap(video_path.string()); if (!cap.isOpened()) { diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index cb8e6bd503..2beda53602 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -26,11 +26,12 @@ def streamer(subword: str) -> bool: # "return None" will be treated the same as "return openvino_genai.StreamingStatus.RUNNING". -def read_video(path: str, num_frames: int = 10) -> Tensor: +def read_video(path: str, num_frames: int = 8) -> Tensor: ''' Args: path: The path to the video. + num_frames: Number of frames sampled from the video. Returns: the ov.Tensor containing the video. From 43c76c9633631d0ee478b631a5fd51302b14c1af Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 15:54:35 +0100 Subject: [PATCH 12/50] Update samples/cpp/visual_language_chat/README.md Co-authored-by: Vladimir Zlobin --- samples/cpp/visual_language_chat/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index 7ace182f4b..c80377665e 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -35,7 +35,7 @@ Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai ## Run video to text chat: -To run this sample a model that supports video input is required, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. +A model that supports video input is required to run this sample, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. [This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video. From d77276f9f568f509a0e76ec3af05e6b21d07fe02 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 17:08:44 +0100 Subject: [PATCH 13/50] Fixed opencv version, minor corrections. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 1 + samples/deployment-requirements.txt | 2 +- samples/python/visual_language_chat/README.md | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 5e1e714926..6ad92d0fee 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -61,7 +61,7 @@ include(FetchContent) FetchContent_Declare( opencv GIT_REPOSITORY https://github.com/opencv/opencv.git - GIT_TAG 4.12.0 + GIT_TAG 4.11.0 GIT_SHALLOW TRUE GIT_PROGRESS TRUE ) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 21b95fa620..df2f19661d 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -41,6 +41,7 @@ ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames size_t frame_idx = 0; while (cap.read(frame)) { + OPENVINO_ASSERT(frame.cols == width && frame.rows == height && frame.channels() == 3); if (std::find(indices.begin(), indices.end(), frame_idx) != indices.end()) { memcpy(video_tensor_data, frame.data, frame.total() * 3 * sizeof(uint8_t)); video_tensor_data += frame.total() * 3; diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt index 910805dc48..f776aaa555 100644 --- a/samples/deployment-requirements.txt +++ b/samples/deployment-requirements.txt @@ -4,4 +4,4 @@ librosa==0.11.0 # For Whisper pillow==12.0.0 # Image processing for VLMs json5==0.12.1 # For ReAct pydantic==2.12.4 # For Structured output json schema -opencv-python # For video-to-text VLM sample +opencv-python==4.12.0.88 # For video-to-text VLM sample diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md index 17f0308a14..6c0ea68623 100644 --- a/samples/python/visual_language_chat/README.md +++ b/samples/python/visual_language_chat/README.md @@ -51,7 +51,7 @@ See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md# ## Run video-to-text chat sample: -To run this sample a model that supports video input is required, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. +A model that supports video input is required to run this sample, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. [This video](https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4) can be used as a sample video. From cabd7632df5ec0e13d0698a87870087bf03ae2dd Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 25 Nov 2025 17:13:52 +0100 Subject: [PATCH 14/50] Added assert. --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index df2f19661d..f9edba31bc 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -48,6 +48,7 @@ ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames } frame_idx++; } + OPENVINO_ASSERT(frame_idx == total_num_frames); return video_tensor; } From 46e7d5d5a58a31fa07be452927fbf4b60ee44c7c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 26 Nov 2025 10:00:35 +0100 Subject: [PATCH 15/50] Increase samples build timeout. --- .github/workflows/linux.yml | 2 +- .github/workflows/mac.yml | 2 +- .github/workflows/windows.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index a4ad20710b..1cf03b06e0 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -379,7 +379,7 @@ jobs: matrix: build-type: [Release] needs: [ openvino_download, genai_build_cmake ] - timeout-minutes: 10 + timeout-minutes: 30 defaults: run: shell: bash diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 86dedec156..15a7794b2f 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -318,7 +318,7 @@ jobs: matrix: build-type: [Release] needs: [ openvino_download, genai_build_cmake ] - timeout-minutes: 10 + timeout-minutes: 30 defaults: run: shell: bash diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 93a03be433..f675a9d9fb 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -487,7 +487,7 @@ jobs: matrix: build-type: [Release, Debug] needs: [ openvino_download, genai_build_cpack ] - timeout-minutes: 10 + timeout-minutes: 30 defaults: run: shell: pwsh From 5b6044d68ee4c594939bbb567d3b06f2faaa5f9f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 26 Nov 2025 14:11:36 +0100 Subject: [PATCH 16/50] Cmake corrected. --- samples/cpp/visual_language_chat/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 6ad92d0fee..f5afedecc3 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -76,3 +76,12 @@ target_include_directories(video_to_text_chat PRIVATE ${OPENCV_MODULE_opencv_videoio_LOCATION}/include ) target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) + +set_target_properties(video_to_text_chat PROPERTIES + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS video_to_text_chat + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) \ No newline at end of file From 54cebe69e71b1423cc340a7f4f42613f70abca6e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 27 Nov 2025 13:04:09 +0100 Subject: [PATCH 17/50] Attempt to fix ci. --- samples/cpp/visual_language_chat/CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index f5afedecc3..6d4d59bc2a 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -1,6 +1,10 @@ # Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +if (MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") +endif() + find_package(OpenVINOGenAI REQUIRED PATHS "${CMAKE_BINARY_DIR}" # Reuse the package from the build. @@ -84,4 +88,10 @@ set_target_properties(video_to_text_chat PROPERTIES install(TARGETS video_to_text_chat RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) + + +install(DIRECTORY ${opencv_BINARY_DIR}/lib/ + DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) From e8cb51e92015f319f94482125851edf7e4d5e97d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 27 Nov 2025 16:21:16 +0100 Subject: [PATCH 18/50] Fix on win. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 6d4d59bc2a..20b817c4b5 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 if (MSVC) - set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") + set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") endif() find_package(OpenVINOGenAI REQUIRED From 6dac23e704f017840004416cbb4a1952955bf29e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 28 Nov 2025 10:44:07 +0100 Subject: [PATCH 19/50] Apply suggestions from code review Co-authored-by: Yaroslav Tarkan --- samples/cpp/visual_language_chat/README.md | 4 ++-- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 2 +- samples/python/visual_language_chat/video_to_text_chat.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index c80377665e..95d432436d 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -22,7 +22,7 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code Mi Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run samples. -## Run visual language chat: +## Run image-to-text chat sample: [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. @@ -33,7 +33,7 @@ Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) for more details. -## Run video to text chat: +## Run video-to-text chat sample: A model that supports video input is required to run this sample, for example `llava-hf/LLaVA-NeXT-Video-7B-hf`. diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index f9edba31bc..984ac72792 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index 2beda53602..0c1cb76299 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 From faffe5975286c719cfa4b82e6d56625c0bcfdc15 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 28 Nov 2025 13:03:42 +0100 Subject: [PATCH 20/50] Attempt too fix error. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 20b817c4b5..8102b11174 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -5,6 +5,8 @@ if (MSVC) set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") endif() +set(BUILD_SHARED_LIBS ON) + find_package(OpenVINOGenAI REQUIRED PATHS "${CMAKE_BINARY_DIR}" # Reuse the package from the build. From 6e749ff0f4db97cfd0ce8d2654c98c3d0b43c45a Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 28 Nov 2025 15:38:39 +0100 Subject: [PATCH 21/50] Attempt to fix cmake. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 8102b11174..0126aa8369 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -84,6 +84,8 @@ target_include_directories(video_to_text_chat PRIVATE target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) set_target_properties(video_to_text_chat PROPERTIES + INSTALL_RPATH "\$ORIGIN/lib" + BUILD_RPATH "\$ORIGIN/lib" # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) From f71bb594646447ff1b482b9256e2840557c92321 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 11:15:51 +0100 Subject: [PATCH 22/50] Attempt to fix. --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 1cf03b06e0..4992993d68 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -723,7 +723,7 @@ jobs: if: ${{ matrix.test.run_condition }} run: python -m pytest -vvs ${{ env.SRC_DIR }}/${{ matrix.test.cmd }} -m "${{ matrix.test.marker }}" env: - LD_LIBRARY_PATH: "${{ env.INSTALL_DIR }}/runtime/lib/intel64:${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb/lib:$LD_LIBRARY_PATH" # Required for C++ samples + LD_LIBRARY_PATH: "${{ env.INSTALL_DIR }}/lib:${{ env.INSTALL_DIR }}/runtime/lib/intel64:${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb/lib:$LD_LIBRARY_PATH" # Required for C++ samples SAMPLES_PY_DIR: "${{ env.INSTALL_DIR }}/samples/python" SAMPLES_JS_DIR: "${{ env.SRC_DIR }}/samples/js" SAMPLES_CPP_DIR: "${{ env.INSTALL_DIR }}/samples_bin" From bbf700cec5e2752078c78ac13b4180506286ecf1 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 12:22:46 +0100 Subject: [PATCH 23/50] Change video. --- tests/python_tests/samples/conftest.py | 3 +-- tests/python_tests/samples/test_video_to_text_chat.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python_tests/samples/conftest.py b/tests/python_tests/samples/conftest.py index 7d71107ef9..3aa71fcb3c 100644 --- a/tests/python_tests/samples/conftest.py +++ b/tests/python_tests/samples/conftest.py @@ -173,8 +173,7 @@ "cat": "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", "3283_1447_000.tar.gz": "https://huggingface.co/datasets/facebook/multilingual_librispeech/resolve/main/data/mls_polish/train/audio/3283_1447_000.tar.gz", "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin", - "video0.mp4": "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - "videos/sample_video.mp4": "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4", + "video0.mp4": "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4" } SAMPLES_PY_DIR = Path( diff --git a/tests/python_tests/samples/test_video_to_text_chat.py b/tests/python_tests/samples/test_video_to_text_chat.py index 0b7c1c2405..9a19c0853f 100644 --- a/tests/python_tests/samples/test_video_to_text_chat.py +++ b/tests/python_tests/samples/test_video_to_text_chat.py @@ -15,7 +15,7 @@ class TestVisualLanguageChat: @pytest.mark.parametrize( "convert_model, download_test_content, questions", [ - pytest.param("tiny-random-llava-next-video", "videos/sample_video.mp4", 'What is unusual on this video?\nGo on.') + pytest.param("tiny-random-llava-next-video", "video0.mp4", 'What is unusual on this video?\nGo on.') ], indirect=["convert_model", "download_test_content"], ) From 40ac708c17143a887c0e06e605488421c23dd342 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 14:10:10 +0100 Subject: [PATCH 24/50] Set WITH_FFMPEG. --- samples/cpp/visual_language_chat/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 0126aa8369..a8e70e0edd 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -5,8 +5,6 @@ if (MSVC) set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") endif() -set(BUILD_SHARED_LIBS ON) - find_package(OpenVINOGenAI REQUIRED PATHS "${CMAKE_BINARY_DIR}" # Reuse the package from the build. @@ -64,6 +62,8 @@ install(TARGETS benchmark_vlm include(FetchContent) +set(BUILD_SHARED_LIBS ON) +set(WITH_FFMPEG ON) FetchContent_Declare( opencv GIT_REPOSITORY https://github.com/opencv/opencv.git From ace9a6a8056b4656979c360799e08ed622d77084 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 15:06:14 +0100 Subject: [PATCH 25/50] Temporarily remove launching of cpp sample. --- tests/python_tests/samples/test_video_to_text_chat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_tests/samples/test_video_to_text_chat.py b/tests/python_tests/samples/test_video_to_text_chat.py index 9a19c0853f..3f572423df 100644 --- a/tests/python_tests/samples/test_video_to_text_chat.py +++ b/tests/python_tests/samples/test_video_to_text_chat.py @@ -21,9 +21,9 @@ class TestVisualLanguageChat: ) def test_sample_visual_language_chat(self, convert_model, download_test_content, questions): # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'video_to_text_chat') - cpp_command = [cpp_sample, convert_model, download_test_content] - cpp_result = run_sample(cpp_command, questions) + # cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'video_to_text_chat') + # cpp_command = [cpp_sample, convert_model, download_test_content] + # cpp_result = run_sample(cpp_command, questions) # Test Python sample py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/video_to_text_chat.py") @@ -31,4 +31,4 @@ def test_sample_visual_language_chat(self, convert_model, download_test_content, py_result = run_sample(py_command, questions) # Compare results - assert py_result.stdout == cpp_result.stdout, f"Results should match" + # assert py_result.stdout == cpp_result.stdout, f"Results should match" From f8a3d0dd4143261335db81dfc2cfbd5b01521c15 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 16:55:04 +0100 Subject: [PATCH 26/50] Returned cpp sample launch. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- tests/python_tests/samples/test_video_to_text_chat.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index a8e70e0edd..34fc16b6ee 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -63,7 +63,7 @@ install(TARGETS benchmark_vlm include(FetchContent) set(BUILD_SHARED_LIBS ON) -set(WITH_FFMPEG ON) +set(WITH_FFMPEG ON CACHE BOOL "" FORCE) FetchContent_Declare( opencv GIT_REPOSITORY https://github.com/opencv/opencv.git diff --git a/tests/python_tests/samples/test_video_to_text_chat.py b/tests/python_tests/samples/test_video_to_text_chat.py index 3f572423df..9a19c0853f 100644 --- a/tests/python_tests/samples/test_video_to_text_chat.py +++ b/tests/python_tests/samples/test_video_to_text_chat.py @@ -21,9 +21,9 @@ class TestVisualLanguageChat: ) def test_sample_visual_language_chat(self, convert_model, download_test_content, questions): # Test CPP sample - # cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'video_to_text_chat') - # cpp_command = [cpp_sample, convert_model, download_test_content] - # cpp_result = run_sample(cpp_command, questions) + cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'video_to_text_chat') + cpp_command = [cpp_sample, convert_model, download_test_content] + cpp_result = run_sample(cpp_command, questions) # Test Python sample py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/video_to_text_chat.py") @@ -31,4 +31,4 @@ def test_sample_visual_language_chat(self, convert_model, download_test_content, py_result = run_sample(py_command, questions) # Compare results - # assert py_result.stdout == cpp_result.stdout, f"Results should match" + assert py_result.stdout == cpp_result.stdout, f"Results should match" From 87c58f60fc7b2056787dcb2fbcb59d9dbba628b8 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Mon, 1 Dec 2025 18:06:37 +0100 Subject: [PATCH 27/50] Add install ffmpeg. --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 296105085f..2aee4f5226 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -418,6 +418,7 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | + sudo apt install ffmpeg libavformat-dev libavcodec-dev libswscale-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh From 5440d420eac19d819aa8341f471a2ec1016cb499 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 09:54:16 +0100 Subject: [PATCH 28/50] Minor correction. --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 2aee4f5226..3276aaaa81 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -418,7 +418,7 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | - sudo apt install ffmpeg libavformat-dev libavcodec-dev libswscale-dev + sudo apt update && sudo apt install -y ffmpeg libavformat-dev libavcodec-dev libswscale-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh From 96d4fa24b33b6e20a264ec7c2b3de46f96d49033 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 10:25:02 +0100 Subject: [PATCH 29/50] Added libs install needed by ffmpeg. --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 3276aaaa81..5e62d6753b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -418,7 +418,7 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | - sudo apt update && sudo apt install -y ffmpeg libavformat-dev libavcodec-dev libswscale-dev + sudo apt update && sudo apt install -y ffmpeg libgtk2.0-dev pkg-config libavformat-dev libavcodec-dev libswscale-dev libavutil-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh From 21127edd461f0ad8d92b03948fb3e927e2f3e9c5 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 10:54:59 +0100 Subject: [PATCH 30/50] Minor correction. --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 5e62d6753b..da1178801f 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -418,7 +418,7 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | - sudo apt update && sudo apt install -y ffmpeg libgtk2.0-dev pkg-config libavformat-dev libavcodec-dev libswscale-dev libavutil-dev + sudo apt update && sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh From 95ce2a383d6f38b276180128c5171de2e6ef5183 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 12:30:25 +0100 Subject: [PATCH 31/50] Add debug info. --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 984ac72792..26ffd76b99 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -76,6 +76,7 @@ int main(int argc, char* argv[]) try { if (argc < 3 || argc > 4) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } + std::cout << cv::getBuildInformation() << std::endl; std::vector videos = load_videos(argv[2]); From 8f4457e0dd40b7bad1f0274da28b28e8915a7985 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 14:29:12 +0100 Subject: [PATCH 32/50] Attempt to fix. --- .github/workflows/linux.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index da1178801f..49da28c3c2 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -418,7 +418,8 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | - sudo apt update && sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev + apt update && apt install sudo + sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh From b7c8dd20952dc3ea8dce1fdd459c24f97388ac8a Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 15:09:31 +0100 Subject: [PATCH 33/50] Applied comments, removed debug print. --- samples/cpp/visual_language_chat/CMakeLists.txt | 1 - samples/cpp/visual_language_chat/video_to_text_chat.cpp | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 34fc16b6ee..bfac6145d5 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -69,7 +69,6 @@ FetchContent_Declare( GIT_REPOSITORY https://github.com/opencv/opencv.git GIT_TAG 4.11.0 GIT_SHALLOW TRUE - GIT_PROGRESS TRUE ) FetchContent_MakeAvailable(opencv) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 26ffd76b99..6bbc9246c7 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -55,7 +55,7 @@ ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames std::vector load_videos(const std::filesystem::path& input_path) { if (input_path.empty() || !fs::exists(input_path)) { - throw std::runtime_error{"Path to images is empty or does not exist."}; + OPENVINO_THROW("Path to images is empty or does not exist."); } if (fs::is_directory(input_path)) { std::set sorted_videos{fs::directory_iterator(input_path), fs::directory_iterator()}; @@ -74,10 +74,8 @@ bool print_subword(std::string&& subword) { int main(int argc, char* argv[]) try { if (argc < 3 || argc > 4) { - throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + OPENVINO_THROW(std::string{"Usage "} + argv[0] + " "); } - std::cout << cv::getBuildInformation() << std::endl; - std::vector videos = load_videos(argv[2]); // GPU and NPU can be used as well. From e11adbb78902de7c7ad2db08437da8e33796909c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 15:34:24 +0100 Subject: [PATCH 34/50] Attempt to fix. --- .github/workflows/linux.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 49da28c3c2..60853cb156 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -167,6 +167,8 @@ jobs: - name: CMake Build run: | + apt update && apt install sudo + sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev source ${{ env.OV_INSTALL_DIR }}/setupvars.sh cmake -DOpenVINODeveloperPackage_DIR=${{ env.OV_INSTALL_DIR }}/developer_package/cmake \ From b274aea08e70bd13be260ec2d9c7207fe54503ba Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 16:33:26 +0100 Subject: [PATCH 35/50] Increase timeout. --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index f3fc4a9f05..568cc05235 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -100,7 +100,7 @@ jobs: matrix: build-type: [Release, Debug] needs: [ openvino_download ] - timeout-minutes: 45 + timeout-minutes: 80 defaults: run: shell: pwsh From d423f8ea89133d160ff757318eca35980c670dff Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 17:33:43 +0100 Subject: [PATCH 36/50] Removed not needed code. --- .github/workflows/linux.yml | 2 -- samples/cpp/visual_language_chat/CMakeLists.txt | 11 ++--------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 60853cb156..1c6eec46b3 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -420,8 +420,6 @@ jobs: - name: Build Samples (Release) if: ${{ 'Release' == matrix.build-type }} run: | - apt update && apt install sudo - sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev chmod +x ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh ${{ env.OV_INSTALL_DIR }}/samples/cpp/build_samples.sh -i ${{ env.INSTALL_DIR }} chmod +x ${{ env.OV_INSTALL_DIR }}/samples/c/build_samples.sh diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index bfac6145d5..842d63b005 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -62,8 +62,9 @@ install(TARGETS benchmark_vlm include(FetchContent) + set(BUILD_SHARED_LIBS ON) -set(WITH_FFMPEG ON CACHE BOOL "" FORCE) +set(WITH_FFMPEG ON) FetchContent_Declare( opencv GIT_REPOSITORY https://github.com/opencv/opencv.git @@ -83,8 +84,6 @@ target_include_directories(video_to_text_chat PRIVATE target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) set_target_properties(video_to_text_chat PROPERTIES - INSTALL_RPATH "\$ORIGIN/lib" - BUILD_RPATH "\$ORIGIN/lib" # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) @@ -92,9 +91,3 @@ install(TARGETS video_to_text_chat RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) - - -install(DIRECTORY ${opencv_BINARY_DIR}/lib/ - DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) From 931a072bf478e71639263a63f2b396c0ed5ddaa7 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 2 Dec 2025 19:18:05 +0100 Subject: [PATCH 37/50] Increase timeout. --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 568cc05235..d8bef0ae20 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -487,7 +487,7 @@ jobs: matrix: build-type: [Release, Debug] needs: [ openvino_download, genai_build_cpack ] - timeout-minutes: 30 + timeout-minutes: 50 defaults: run: shell: pwsh From e9fd0ce0cd135d84e6ad0b9a1164937ff031d467 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 3 Dec 2025 09:14:47 +0100 Subject: [PATCH 38/50] Update samples/cpp/visual_language_chat/video_to_text_chat.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 6bbc9246c7..392fb3e7c8 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -55,7 +55,7 @@ ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames std::vector load_videos(const std::filesystem::path& input_path) { if (input_path.empty() || !fs::exists(input_path)) { - OPENVINO_THROW("Path to images is empty or does not exist."); + OPENVINO_THROW("Path to videos is empty or does not exist."); } if (fs::is_directory(input_path)) { std::set sorted_videos{fs::directory_iterator(input_path), fs::directory_iterator()}; From 2883d5adedb4acb986f5b4517bab5f124f3fb7a5 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 3 Dec 2025 09:43:45 +0100 Subject: [PATCH 39/50] Increase timeout for building win samples. --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index d8bef0ae20..839526520a 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -487,7 +487,7 @@ jobs: matrix: build-type: [Release, Debug] needs: [ openvino_download, genai_build_cpack ] - timeout-minutes: 50 + timeout-minutes: 70 defaults: run: shell: pwsh From d839bd7c29cd86b6acd59b47b6a1cfe925e56fe8 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 17:27:54 +0100 Subject: [PATCH 40/50] Set rpath. --- .github/workflows/linux.yml | 2 +- samples/cpp/visual_language_chat/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 1c6eec46b3..67b466d1d8 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -725,7 +725,7 @@ jobs: if: ${{ matrix.test.run_condition }} run: python -m pytest -vvs ${{ env.SRC_DIR }}/${{ matrix.test.cmd }} -m "${{ matrix.test.marker }}" env: - LD_LIBRARY_PATH: "${{ env.INSTALL_DIR }}/lib:${{ env.INSTALL_DIR }}/runtime/lib/intel64:${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb/lib:$LD_LIBRARY_PATH" # Required for C++ samples + LD_LIBRARY_PATH: "${{ env.INSTALL_DIR }}/runtime/lib/intel64:${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb/lib:$LD_LIBRARY_PATH" # Required for C++ samples SAMPLES_PY_DIR: "${{ env.INSTALL_DIR }}/samples/python" SAMPLES_JS_DIR: "${{ env.SRC_DIR }}/samples/js" SAMPLES_CPP_DIR: "${{ env.INSTALL_DIR }}/samples_bin" diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 842d63b005..93ab147137 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -84,6 +84,7 @@ target_include_directories(video_to_text_chat PRIVATE target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) set_target_properties(video_to_text_chat PROPERTIES + INSTALL_RPATH "$ORIGIN/../lib" # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) From 83fcf65691b26d62fe67a6838049425365e23a69 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 17:10:54 +0100 Subject: [PATCH 41/50] Update .github/workflows/linux.yml Co-authored-by: Vladimir Zlobin --- .github/workflows/linux.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 67b466d1d8..f90b84b555 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -167,8 +167,7 @@ jobs: - name: CMake Build run: | - apt update && apt install sudo - sudo apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev + apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev source ${{ env.OV_INSTALL_DIR }}/setupvars.sh cmake -DOpenVINODeveloperPackage_DIR=${{ env.OV_INSTALL_DIR }}/developer_package/cmake \ From ecdadb5e93ead951dbdfabda8ec95a55a6020642 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 17:12:10 +0100 Subject: [PATCH 42/50] Update samples/cpp/visual_language_chat/video_to_text_chat.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- samples/cpp/visual_language_chat/video_to_text_chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/video_to_text_chat.cpp b/samples/cpp/visual_language_chat/video_to_text_chat.cpp index 392fb3e7c8..b41e72a622 100644 --- a/samples/cpp/visual_language_chat/video_to_text_chat.cpp +++ b/samples/cpp/visual_language_chat/video_to_text_chat.cpp @@ -48,7 +48,7 @@ ov::Tensor load_video(const std::filesystem::path& video_path, size_t num_frames } frame_idx++; } - OPENVINO_ASSERT(frame_idx == total_num_frames); + OPENVINO_ASSERT(frame_idx == total_num_frames, "Frame count mismatch: expected " + std::to_string(total_num_frames) + ", got " + std::to_string(frame_idx)); return video_tensor; } From 5428194f1e0b0e75a8c387fa5f240d4b74a00f22 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 17:40:22 +0100 Subject: [PATCH 43/50] Return apt update. --- .github/workflows/linux.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index f90b84b555..2ae8238f70 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -167,6 +167,7 @@ jobs: - name: CMake Build run: | + apt update apt install -y libgtk2.0-dev pkg-config ffmpeg libavformat-dev libavcodec-dev libswscale-dev libavutil-dev source ${{ env.OV_INSTALL_DIR }}/setupvars.sh From a8700326104f0735c4e8cdeaf10e02d0b8f68cb0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 17:53:26 +0100 Subject: [PATCH 44/50] Align samples. --- .../python/visual_language_chat/video_to_text_chat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index 0c1cb76299..0e960d726c 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -39,16 +39,16 @@ def read_video(path: str, num_frames: int = 8) -> Tensor: cap = cv2.VideoCapture(path) frames = [] + indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int) + idx = 0 while cap.isOpened(): ret, frame = cap.read() if not ret: break - - frames.append(np.array(frame)) - - indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int) - frames = [frames[i] for i in indices] + idx++ + if idx in indices: + frames.append(np.array(frame)) return Tensor(frames) From 7939160ee9e4b74b8b0679ad81c5faff5bac489b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 18:14:18 +0100 Subject: [PATCH 45/50] Minor fix. --- samples/python/visual_language_chat/video_to_text_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index 0e960d726c..4b41ef314a 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -46,9 +46,9 @@ def read_video(path: str, num_frames: int = 8) -> Tensor: ret, frame = cap.read() if not ret: break - idx++ if idx in indices: frames.append(np.array(frame)) + idx++ return Tensor(frames) From 86d2e5888cec80a16d023ffdbeaa9dd3db81a87f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 19:34:47 +0100 Subject: [PATCH 46/50] Attempt to fix not found lib. --- samples/cpp/visual_language_chat/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 93ab147137..360412b711 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -79,9 +79,10 @@ add_executable(video_to_text_chat video_to_text_chat.cpp) target_include_directories(video_to_text_chat PRIVATE ${OPENCV_CONFIG_FILE_INCLUDE_DIR} ${OPENCV_MODULE_opencv_core_LOCATION}/include + ${OPENCV_MODULE_opencv_imgcodecs}/include ${OPENCV_MODULE_opencv_videoio_LOCATION}/include ) -target_link_libraries(video_to_text_chat opencv_core opencv_videoio openvino::genai cxxopts::cxxopts) +target_link_libraries(video_to_text_chat opencv_core opencv_imgcodecs opencv_videoio openvino::genai cxxopts::cxxopts) set_target_properties(video_to_text_chat PROPERTIES INSTALL_RPATH "$ORIGIN/../lib" From 64afe2489e271710d7550550ae2404969d6dd23c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 4 Dec 2025 20:30:17 +0100 Subject: [PATCH 47/50] Fix typo. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 360412b711..fc6858b1ea 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -79,7 +79,7 @@ add_executable(video_to_text_chat video_to_text_chat.cpp) target_include_directories(video_to_text_chat PRIVATE ${OPENCV_CONFIG_FILE_INCLUDE_DIR} ${OPENCV_MODULE_opencv_core_LOCATION}/include - ${OPENCV_MODULE_opencv_imgcodecs}/include + ${OPENCV_MODULE_opencv_imgcodecs_LOCATION}/include ${OPENCV_MODULE_opencv_videoio_LOCATION}/include ) target_link_libraries(video_to_text_chat opencv_core opencv_imgcodecs opencv_videoio openvino::genai cxxopts::cxxopts) From f162f9ec62164eeb14c6079d093f7f5373bad1a1 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 5 Dec 2025 13:43:29 +0100 Subject: [PATCH 48/50] Attempt to fix. --- samples/cpp/visual_language_chat/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index fc6858b1ea..daca6cd372 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -85,7 +85,7 @@ target_include_directories(video_to_text_chat PRIVATE target_link_libraries(video_to_text_chat opencv_core opencv_imgcodecs opencv_videoio openvino::genai cxxopts::cxxopts) set_target_properties(video_to_text_chat PROPERTIES - INSTALL_RPATH "$ORIGIN/../lib" + INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) From 7021d4d7297d6e92eac3c83f1b2830faf291b683 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 5 Dec 2025 17:15:13 +0100 Subject: [PATCH 49/50] Fix rpath. --- .../cpp/visual_language_chat/CMakeLists.txt | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index daca6cd372..37a14faef0 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -79,15 +79,21 @@ add_executable(video_to_text_chat video_to_text_chat.cpp) target_include_directories(video_to_text_chat PRIVATE ${OPENCV_CONFIG_FILE_INCLUDE_DIR} ${OPENCV_MODULE_opencv_core_LOCATION}/include - ${OPENCV_MODULE_opencv_imgcodecs_LOCATION}/include ${OPENCV_MODULE_opencv_videoio_LOCATION}/include ) -target_link_libraries(video_to_text_chat opencv_core opencv_imgcodecs opencv_videoio openvino::genai cxxopts::cxxopts) - -set_target_properties(video_to_text_chat PROPERTIES - INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) +target_link_libraries(video_to_text_chat opencv_core opencv_imgcodecs opencv_imgproc opencv_videoio openvino::genai cxxopts::cxxopts) + +if(LINUX) + set_target_properties(video_to_text_chat opencv_core opencv_imgcodecs opencv_imgproc opencv_videoio PROPERTIES + INSTALL_RPATH "$ORIGIN/../lib" + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +elseif(APPLE) + set_target_properties(video_to_text_chat opencv_core opencv_imgcodecs opencv_imgproc opencv_videoio PROPERTIES + INSTALL_RPATH "@loader_path/../lib" + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +endif() install(TARGETS video_to_text_chat RUNTIME DESTINATION samples_bin/ From cd2a1474f03577334304b9b0d66c15fe1ce216bf Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 5 Dec 2025 20:18:48 +0100 Subject: [PATCH 50/50] Align samples. --- samples/python/visual_language_chat/video_to_text_chat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/samples/python/visual_language_chat/video_to_text_chat.py b/samples/python/visual_language_chat/video_to_text_chat.py index 4b41ef314a..b21a83bbb1 100644 --- a/samples/python/visual_language_chat/video_to_text_chat.py +++ b/samples/python/visual_language_chat/video_to_text_chat.py @@ -39,7 +39,8 @@ def read_video(path: str, num_frames: int = 8) -> Tensor: cap = cv2.VideoCapture(path) frames = [] - indices = np.arange(0, len(frames), len(frames) / num_frames).astype(int) + total_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(int) idx = 0 while cap.isOpened(): @@ -48,7 +49,8 @@ def read_video(path: str, num_frames: int = 8) -> Tensor: break if idx in indices: frames.append(np.array(frame)) - idx++ + idx+=1 + assert idx == total_num_frames, "Frame count mismatch: expected {}, got {}".format(total_num_frames, idx) return Tensor(frames)