{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example for Qwen3-VL-Embedding using vLLM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Import Packages and Prepare Utility Functions" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2026-01-16T07:47:13.726161Z", "iopub.status.busy": "2026-01-16T07:47:13.726038Z", "iopub.status.idle": "2026-01-16T07:47:29.047702Z", "shell.execute_reply": "2026-01-16T07:47:29.046615Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/cpfs01/user/linqi.lmx/envs/vllm014n/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import numpy as np\n", "import os\n", "from typing import List, Dict, Any\n", "from vllm import LLM\n", "from vllm.multimodal.utils import fetch_image\n", "from PIL import Image\n", "\n", "def format_input_to_conversation(\n", " input_dict: Dict[str, Any], \n", " default_instruction: str = \"Represent the user's input.\"\n", ") -> List[Dict]:\n", " content = []\n", " \n", " instruction = input_dict.get('instruction') or default_instruction\n", " text = input_dict.get('text')\n", " image = input_dict.get('image')\n", " \n", " if image:\n", " image_content = None\n", " if isinstance(image, str):\n", " if image.startswith(('http://', 'https://')):\n", " image_content = image\n", " else:\n", " abs_image_path = os.path.abspath(image)\n", " image_content = 'file://' + abs_image_path\n", " else:\n", " image_content = image\n", " \n", " if image_content:\n", " content.append({\n", " 'type': 'image', \n", " 'image': image_content,\n", " })\n", " \n", " if text:\n", " content.append({'type': 'text', 'text': text})\n", " \n", " if not content:\n", " content.append({'type': 'text', 'text': \"\"})\n", " \n", " conversation = [\n", " {\"role\": \"system\", \"content\": [{\"type\": \"text\", \"text\": instruction}]},\n", " {\"role\": \"user\", \"content\": content}\n", " ]\n", " \n", " return conversation\n", "\n", "def prepare_vllm_inputs(\n", " input_dict: Dict[str, Any], \n", " llm, \n", ") -> Dict[str, Any]:\n", " conversation = format_input_to_conversation(input_dict)\n", " \n", " prompt_text = llm.llm_engine.tokenizer.apply_chat_template(\n", " conversation, \n", " tokenize=False, \n", " add_generation_prompt=True\n", " )\n", " \n", " multi_modal_data = None\n", " image = input_dict.get('image')\n", " if image:\n", " if isinstance(image, str):\n", " if image.startswith(('http://', 'https://')):\n", " try:\n", " image_obj = fetch_image(image)\n", " multi_modal_data = {\"image\": image_obj}\n", " except Exception as e:\n", " print(f\"Warning: Failed to fetch image {image}: {e}\")\n", " else:\n", " abs_image_path = os.path.abspath(image)\n", " if os.path.exists(abs_image_path):\n", " image_obj = Image.open(abs_image_path)\n", " multi_modal_data = {\"image\": image_obj}\n", " else:\n", " print(f\"Warning: Image file not found: {abs_image_path}\")\n", " else:\n", " multi_modal_data = {\"image\": image}\n", " \n", " result = {\n", " \"prompt\": prompt_text,\n", " \"multi_modal_data\": multi_modal_data\n", " }\n", " return result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Convert Inputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-01-16T07:47:29.050333Z", "iopub.status.busy": "2026-01-16T07:47:29.050112Z", "iopub.status.idle": "2026-01-16T07:48:21.988074Z", "shell.execute_reply": "2026-01-16T07:48:21.987158Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:29 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B'}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:29 [model.py:859] Resolved `--convert auto` to `--convert embed`. Pass the value explicitly to silence this message.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:29 [model.py:530] Resolved architecture: Qwen3VLForConditionalGeneration\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:29 [model.py:1547] Using max model len 262144\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-01-16 07:47:30,017\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:30 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:30 [vllm.py:618] Asynchronous scheduling is enabled.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:30 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING 01-16 07:47:30 [vllm.py:732] Pooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:33 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=PoolerConfig(pooling_type=None, seq_pooling_type='LAST', tok_pooling_type='ALL', normalize=None, dimensions=None, enable_chunked_processing=None, max_embed_len=None, softmax=None, activation=None, use_activation=None, logit_bias=None, step_tag_id=None, returned_token_ids=None), compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None}\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:33 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:38423 backend=nccl\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:33 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:43 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:44 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 07:47:44 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m " ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r\n", "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00system\n", "Retrieve images or text relevant to the user's query.<|im_end|>\n", "<|im_start|>user\n", "A woman playing with her dog on a beach at sunset.<|im_end|>\n", "<|im_start|>assistant\n", "...\n" ] } ], "source": [ "# Initialize model\n", "llm = LLM(\n", " model=\"Qwen/Qwen3-VL-Embedding-2B\",\n", " runner=\"pooling\",\n", " dtype='bfloat16',\n", " trust_remote_code=True,\n", ")\n", "\n", "# Prepare input samples\n", "inputs = [\n", " {\n", " \"text\": \"A woman playing with her dog on a beach at sunset.\",\n", " \"instruction\": \"Retrieve images or text relevant to the user's query.\",\n", " },\n", " {\n", " \"text\": \"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.\"\n", " },\n", " {\n", " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n", " },\n", " {\n", " \"text\": \"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.\",\n", " \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n", " }\n", "]\n", "\n", "print(f\"Prepared {len(inputs)} input samples\")\n", "\n", "# Convert to vLLM format\n", "vllm_inputs = [prepare_vllm_inputs(inp, llm) for inp in inputs]\n", "\n", "print(\"Input conversion completed!\")\n", "print(f\"\\nPreview of the first input prompt:\")\n", "print(vllm_inputs[0][\"prompt\"][:200] + \"...\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Get Embeddings" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2026-01-16T07:48:21.990386Z", "iopub.status.busy": "2026-01-16T07:48:21.989851Z", "iopub.status.idle": "2026-01-16T07:48:24.738882Z", "shell.execute_reply": "2026-01-16T07:48:24.737987Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\r\n", "Adding requests: 0%| | 0/4 [00:00