{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example for Qwen3-VL-Embedding using vLLM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import Packages and Prepare Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T07:47:13.726161Z",
     "iopub.status.busy": "2026-01-16T07:47:13.726038Z",
     "iopub.status.idle": "2026-01-16T07:47:29.047702Z",
     "shell.execute_reply": "2026-01-16T07:47:29.046615Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/cpfs01/user/linqi.lmx/envs/vllm014n/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "from typing import List, Dict, Any\n",
    "from vllm import LLM\n",
    "from vllm.multimodal.utils import fetch_image\n",
    "from PIL import Image\n",
    "\n",
    "def format_input_to_conversation(\n",
    "    input_dict: Dict[str, Any],\n",
    "    default_instruction: str = \"Represent the user's input.\"\n",
    ") -> List[Dict]:\n",
    "    \"\"\"Build a system + user chat conversation from one input sample.\n",
    "\n",
    "    Args:\n",
    "        input_dict: Sample dict; the optional keys 'instruction', 'text'\n",
    "            and 'image' are read.\n",
    "        default_instruction: System instruction used when the sample has\n",
    "            no (or an empty) 'instruction'.\n",
    "\n",
    "    Returns:\n",
    "        A two-element list: a system message holding the instruction and a\n",
    "        user message holding the image entry (if any) followed by the text\n",
    "        entry (if any). When neither is present the user message holds a\n",
    "        single empty text entry.\n",
    "    \"\"\"\n",
    "    system_text = input_dict.get('instruction') or default_instruction\n",
    "    raw_text = input_dict.get('text')\n",
    "    raw_image = input_dict.get('image')\n",
    "\n",
    "    user_parts = []\n",
    "\n",
    "    if raw_image:\n",
    "        if not isinstance(raw_image, str):\n",
    "            # Non-string images are passed through untouched.\n",
    "            image_ref = raw_image\n",
    "        elif raw_image.startswith(('http://', 'https://')):\n",
    "            # Remote URLs are used verbatim.\n",
    "            image_ref = raw_image\n",
    "        else:\n",
    "            # Anything else is treated as a local path and made an absolute file:// URI.\n",
    "            image_ref = 'file://' + os.path.abspath(raw_image)\n",
    "        user_parts.append({'type': 'image', 'image': image_ref})\n",
    "\n",
    "    if raw_text:\n",
    "        user_parts.append({'type': 'text', 'text': raw_text})\n",
    "\n",
    "    # The user turn always carries at least one entry.\n",
    "    if not user_parts:\n",
    "        user_parts.append({'type': 'text', 'text': \"\"})\n",
    "\n",
    "    system_message = {\"role\": \"system\", \"content\": [{\"type\": \"text\", \"text\": system_text}]}\n",
    "    user_message = {\"role\": \"user\", \"content\": user_parts}\n",
    "    return [system_message, user_message]\n",
    "\n",
    "def _load_image_for_vllm(image: Any) -> Any:\n",
    "    \"\"\"Resolve an 'image' field into an object usable as vLLM multi-modal data.\n",
    "\n",
    "    Strings starting with http:// or https:// are fetched with ``fetch_image``;\n",
    "    other strings are treated as local paths; non-string values are returned\n",
    "    unchanged. Returns None (after printing a warning) when the image cannot\n",
    "    be fetched, found or read, and None without a warning for a falsy input.\n",
    "    \"\"\"\n",
    "    if not image:\n",
    "        return None\n",
    "    if not isinstance(image, str):\n",
    "        return image\n",
    "\n",
    "    if image.startswith(('http://', 'https://')):\n",
    "        try:\n",
    "            return fetch_image(image)\n",
    "        except Exception as e:\n",
    "            print(f\"Warning: Failed to fetch image {image}: {e}\")\n",
    "            return None\n",
    "\n",
    "    abs_image_path = os.path.abspath(image)\n",
    "    if not os.path.exists(abs_image_path):\n",
    "        print(f\"Warning: Image file not found: {abs_image_path}\")\n",
    "        return None\n",
    "    try:\n",
    "        # Image.open is lazy; decode inside the context manager so the file\n",
    "        # handle is released here instead of staying open on the returned object.\n",
    "        # NOTE(review): RGB is meant to mirror fetch_image's default image_mode -- confirm.\n",
    "        with Image.open(abs_image_path) as img:\n",
    "            return img.convert(\"RGB\")\n",
    "    except OSError as e:\n",
    "        # Same best-effort policy as the remote branch: warn instead of raising.\n",
    "        print(f\"Warning: Failed to load image {abs_image_path}: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def prepare_vllm_inputs(\n",
    "    input_dict: Dict[str, Any],\n",
    "    llm,\n",
    ") -> Dict[str, Any]:\n",
    "    \"\"\"Turn one input sample into a prompt dict for vLLM.\n",
    "\n",
    "    Args:\n",
    "        input_dict: Sample with optional 'instruction', 'text' and 'image' keys.\n",
    "        llm: vLLM ``LLM`` instance; its tokenizer's chat template renders the prompt.\n",
    "\n",
    "    Returns:\n",
    "        Dict with 'prompt' (the chat-templated text) and 'multi_modal_data'\n",
    "        (``{\"image\": <image>}`` when an image was resolved, otherwise None).\n",
    "    \"\"\"\n",
    "    # Resolve the image before rendering the prompt so both always agree.\n",
    "    image_obj = _load_image_for_vllm(input_dict.get('image'))\n",
    "\n",
    "    sample = input_dict\n",
    "    if image_obj is None and input_dict.get('image'):\n",
    "        # The image was requested but could not be loaded: drop it so the chat\n",
    "        # template is not asked to render an image entry with no data behind it.\n",
    "        sample = {key: value for key, value in input_dict.items() if key != 'image'}\n",
    "\n",
    "    conversation = format_input_to_conversation(sample)\n",
    "\n",
    "    # get_tokenizer() is the public accessor; avoids reaching into llm_engine internals.\n",
    "    prompt_text = llm.get_tokenizer().apply_chat_template(\n",
    "        conversation,\n",
    "        tokenize=False,\n",
    "        add_generation_prompt=True\n",
    "    )\n",
    "\n",
    "    return {\n",
    "        \"prompt\": prompt_text,\n",
    "        \"multi_modal_data\": {\"image\": image_obj} if image_obj is not None else None,\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Initialize the Model and Convert Inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T07:47:29.050333Z",
     "iopub.status.busy": "2026-01-16T07:47:29.050112Z",
     "iopub.status.idle": "2026-01-16T07:48:21.988074Z",
     "shell.execute_reply": "2026-01-16T07:48:21.987158Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:29 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:29 [model.py:859] Resolved `--convert auto` to `--convert embed`. Pass the value explicitly to silence this message.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:29 [model.py:530] Resolved architecture: Qwen3VLForConditionalGeneration\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:29 [model.py:1547] Using max model len 262144\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 07:47:30,017\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:30 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:30 [vllm.py:618] Asynchronous scheduling is enabled.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:30 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING 01-16 07:47:30 [vllm.py:732] Pooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:33 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=PoolerConfig(pooling_type=None, seq_pooling_type='LAST', tok_pooling_type='ALL', normalize=None, dimensions=None, enable_chunked_processing=None, max_embed_len=None, softmax=None, activation=None, use_activation=None, logit_bias=None, step_tag_id=None, returned_token_ids=None), compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 
'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.PIECEWISE: 1>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None}\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:33 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:38423 backend=nccl\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:33 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:43 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Embedding-2B...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:44 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:44 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.09it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.08it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:46 [default_loader.py:291] Loading weights took 1.06 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:46 [gpu_model_runner.py:3900] Model loading took 4.4 GiB memory and 2.499499 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:47:46 [gpu_model_runner.py:4711] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 video items of the maximum feature size.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:06 [backends.py:644] Using cache directory: /cpfs01/user/linqi.lmx/.cache/vllm/torch_compile_cache/86f6b21f85/rank_0_0/backbone for vLLM's torch.compile\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:07 [backends.py:704] Dynamo bytecode transform time: 9.40 s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:10 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 16384) from the cache, took 0.889 s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:10 [monitor.py:34] torch.compile takes 10.29 s in total\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:11 [gpu_worker.py:355] Available KV cache memory: 65.22 GiB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:11 [kv_cache_utils.py:1307] GPU KV cache size: 610,560 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:11 [kv_cache_utils.py:1312] Maximum concurrency for 262,144 tokens per request: 2.33x\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 07:48:11,729 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 07:48:11,757 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|                                                                                                           | 0/51 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   4%|███▉                                                                                               | 2/51 [00:00<00:03, 13.39it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   8%|███████▊                                                                                           | 4/51 [00:00<00:04, 11.55it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  14%|█████████████▌                                                                                     | 7/51 [00:00<00:02, 16.33it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  22%|█████████████████████▏                                                                            | 11/51 [00:00<00:01, 21.82it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  29%|████████████████████████████▊                                                                     | 15/51 [00:00<00:01, 24.88it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  35%|██████████████████████████████████▌                                                               | 18/51 [00:00<00:01, 24.55it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  41%|████████████████████████████████████████▎                                                         | 21/51 [00:00<00:01, 25.34it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  49%|████████████████████████████████████████████████                                                  | 25/51 [00:01<00:00, 26.92it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  55%|█████████████████████████████████████████████████████▊                                            | 28/51 [00:01<00:00, 26.75it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  61%|███████████████████████████████████████████████████████████▌                                      | 31/51 [00:01<00:00, 24.39it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  67%|█████████████████████████████████████████████████████████████████▎                                | 34/51 [00:01<00:00, 24.80it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  73%|███████████████████████████████████████████████████████████████████████                           | 37/51 [00:01<00:00, 24.48it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  78%|████████████████████████████████████████████████████████████████████████████▊                     | 40/51 [00:01<00:00, 25.07it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  84%|██████████████████████████████████████████████████████████████████████████████████▋               | 43/51 [00:01<00:00, 25.66it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  90%|████████████████████████████████████████████████████████████████████████████████████████▍         | 46/51 [00:01<00:00, 26.76it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  98%|████████████████████████████████████████████████████████████████████████████████████████████████  | 50/51 [00:02<00:00, 28.94it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 51/51 [00:02<00:00, 24.62it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:14 [gpu_model_runner.py:4852] Graph capturing finished in 3 secs, took -0.76 GiB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=82232)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:14 [core.py:272] init engine (profile, create kv cache, warmup model) took 27.63 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 07:48:21 [llm.py:347] Supported tasks: ['embed', 'token_embed']\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prepared 4 input samples\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input conversion completed!\n",
      "\n",
      "Preview of the first input prompt:\n",
      "<|im_start|>system\n",
      "Retrieve images or text relevant to the user's query.<|im_end|>\n",
      "<|im_start|>user\n",
      "A woman playing with her dog on a beach at sunset.<|im_end|>\n",
      "<|im_start|>assistant\n",
      "...\n"
     ]
    }
   ],
   "source": [
    "# Checkpoint to load\n",
    "MODEL_NAME = \"Qwen/Qwen3-VL-Embedding-2B\"\n",
    "# Demo image and its caption, shared by several samples below\n",
    "DEMO_IMAGE_URL = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n",
    "DEMO_CAPTION = \"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.\"\n",
    "# Number of prompt characters shown in the preview at the end of this cell\n",
    "PROMPT_PREVIEW_CHARS = 200\n",
    "\n",
    "# Initialize model\n",
    "llm = LLM(\n",
    "    model=MODEL_NAME,\n",
    "    runner=\"pooling\",\n",
    "    dtype='bfloat16',\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "\n",
    "# Prepare input samples: query with instruction, caption only, image only, caption + image\n",
    "inputs = [\n",
    "    {\n",
    "        \"text\": \"A woman playing with her dog on a beach at sunset.\",\n",
    "        \"instruction\": \"Retrieve images or text relevant to the user's query.\",\n",
    "    },\n",
    "    {\n",
    "        \"text\": DEMO_CAPTION\n",
    "    },\n",
    "    {\n",
    "        \"image\": DEMO_IMAGE_URL\n",
    "    },\n",
    "    {\n",
    "        \"text\": DEMO_CAPTION,\n",
    "        \"image\": DEMO_IMAGE_URL\n",
    "    }\n",
    "]\n",
    "\n",
    "print(f\"Prepared {len(inputs)} input samples\")\n",
    "\n",
    "# Convert to vLLM format\n",
    "vllm_inputs = [prepare_vllm_inputs(inp, llm) for inp in inputs]\n",
    "\n",
    "print(\"Input conversion completed!\")\n",
    "print(\"\\nPreview of the first input prompt:\")\n",
    "print(vllm_inputs[0][\"prompt\"][:PROMPT_PREVIEW_CHARS] + \"...\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Get Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T07:48:21.990386Z",
     "iopub.status.busy": "2026-01-16T07:48:21.989851Z",
     "iopub.status.idle": "2026-01-16T07:48:24.738882Z",
     "shell.execute_reply": "2026-01-16T07:48:24.737987Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Adding requests:   0%|                                                                                                                                                    | 0/4 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Adding requests:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 3/4 [00:02<00:00,  1.14it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.50it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Processed prompts:   0%|                                                                                              | 0/4 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.65it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r\n",
      "Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 53.33it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embeddings shape: (4, 2048)\n",
      "Embedding dimension per sample: 2048\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Generate embeddings\n",
    "outputs = llm.embed(vllm_inputs)\n",
    "\n",
    "# Extract embedding vectors (one per returned output)\n",
    "embeddings_list = [output.outputs.embedding for output in outputs]\n",
    "\n",
    "# Stack into a 2-D array: one row per sample, one column per embedding dimension\n",
    "embeddings = np.array(embeddings_list)\n",
    "print(f\"Embeddings shape: {embeddings.shape}\")\n",
    "print(f\"Embedding dimension per sample: {embeddings.shape[1]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Display Similarity Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T07:48:24.740732Z",
     "iopub.status.busy": "2026-01-16T07:48:24.740555Z",
     "iopub.status.idle": "2026-01-16T07:48:24.744448Z",
     "shell.execute_reply": "2026-01-16T07:48:24.743982Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Similarity Score Matrix:\n",
      "[[1.00000007 0.78962161 0.71557983 0.72364763]\n",
      " [0.78962161 0.9999999  0.76502916 0.84728952]\n",
      " [0.71557983 0.76502916 0.99999998 0.82388284]\n",
      " [0.72364763 0.84728952 0.82388284 1.00000003]]\n"
     ]
    }
   ],
   "source": [
    "# Similarity matrix: entry [i, j] is the dot product of embedding i and embedding j.\n",
    "# NOTE(review): a dot product equals cosine similarity only for unit-norm rows -- confirm\n",
    "# that the pooling setup normalizes its outputs.\n",
    "similarity_scores = np.matmul(embeddings, embeddings.T)\n",
    "\n",
    "print(\"Similarity Score Matrix:\")\n",
    "print(similarity_scores)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}