# Prompt marker that tells the model where one image is inserted.
_IMAGE_PLACEHOLDER = '<|vision_start|><|image_pad|><|vision_end|>'


def _load_image(image: Any):
    """
    Resolve an image reference into an object that can be passed to vLLM.

    Args:
        image: Either a string (an http(s) URL or a local file path) or an
            already-loaded image object, which is returned unchanged.

    Returns:
        The loaded image object, or None when a URL could not be fetched or
        a local path does not exist (a warning is printed in both cases).
    """
    if not isinstance(image, str):
        return image

    if image.startswith(('http://', 'https://')):
        try:
            return fetch_image(image)
        except Exception as e:
            # Best effort: a broken URL should not abort the whole batch.
            print(f"Warning: Failed to fetch image {image}: {e}")
            return None

    abs_image_path = os.path.abspath(image)
    if not os.path.exists(abs_image_path):
        print(f"Warning: Image file not found: {abs_image_path}")
        return None

    from PIL import Image
    return Image.open(abs_image_path)


def parse_input_dict(input_dict: Dict[str, Any]):
    """
    Parse input dictionary to extract image and text content.

    Args:
        input_dict: Mapping with optional 'image' (URL, local path, or image
            object) and optional 'text' (string) entries.

    Returns:
        A ``(content, mm_data)`` tuple. ``content`` is the image placeholder
        (only if an image was actually loaded) followed by the text.
        ``mm_data`` is ``{'image': [...]}`` holding zero or one image object.
    """
    image = input_dict.get('image')
    text = input_dict.get('text')

    mm_data = {'image': []}
    content = ''

    if image:
        image_obj = _load_image(image)
        # Emit the placeholder only together with a real image, so the number
        # of placeholders in the prompt always equals the number of images in
        # mm_data, even when loading failed and only a warning was printed.
        if image_obj is not None:
            content += _IMAGE_PLACEHOLDER
            mm_data['image'].append(image_obj)

    if text:
        content += text

    return content, mm_data


def format_vllm_input(
    query_dict: Dict[str, Any],
    doc_dict: Dict[str, Any],
    chat_template: str
) -> Dict[str, Any]:
    """
    Format query and document into vLLM input format.

    Args:
        query_dict: Query with optional 'image' / 'text' entries.
        doc_dict: Document with optional 'image' / 'text' entries.
        chat_template: Jinja template source that is rendered with the
            variables ``query_content`` and ``doc_content``.

    Returns:
        A dict with the rendered 'prompt' and 'multi_modal_data', whose
        'image' list holds the query images followed by the document images
        (the same order in which they are parsed here).
    """
    query_content, query_mm_data = parse_input_dict(query_dict)
    doc_content, doc_mm_data = parse_input_dict(doc_dict)

    mm_data = {'image': query_mm_data['image'] + doc_mm_data['image']}

    prompt = Template(chat_template).render(
        query_content=query_content,
        doc_content=doc_content,
    )
    return {
        'prompt': prompt,
        'multi_modal_data': mm_data
    }


def get_rank_scores(
    llm,
    inputs: Dict[str, Any],
    default_instruction: str = "Given a search query, retrieve relevant candidates that answer the query.",
    template_path: str = "reranker_template.jinja"
):
    """
    Generate relevance scores for documents given a query.

    Args:
        llm: Object exposing ``classify(prompts=...)`` (a vLLM ``LLM`` in
            this notebook).
        inputs: Mapping with 'query' (dict), 'documents' (iterable of dicts)
            and an optional 'instruction' string.
        default_instruction: Instruction used when 'instruction' is missing
            or empty.
        template_path: Path of the Jinja template file for the prompt.

    Returns:
        A list with one score per document, in document order; an empty list
        when there are no documents.
    """
    query_dict = inputs['query']
    doc_dicts = list(inputs['documents'])
    if not doc_dicts:
        # Nothing to score: skip the template read and the model call.
        return []

    instruction = inputs.get('instruction') or default_instruction

    # NOTE(review): the template is rendered in two passes -- here with only
    # `instruction`, later per document with query/doc content. This assumes
    # the template file protects its query/doc placeholders during the first
    # pass; confirm against the template file.
    template_source = Path(template_path).read_text(encoding='utf-8')
    chat_template = Template(template_source).render(instruction=instruction)

    prompts = [
        format_vllm_input(query_dict, doc_dict, chat_template)
        for doc_dict in doc_dicts
    ]

    outputs = llm.classify(prompts=prompts)
    # probs[0] is presumably the relevance probability -- TODO confirm
    # against the model's classifier configuration.
    return [output.outputs.probs[0] for output in outputs]
"source": [ "## 2. Initialize Model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2026-01-16T16:15:13.492787Z", "iopub.status.busy": "2026-01-16T16:15:13.492613Z", "iopub.status.idle": "2026-01-16T16:16:03.427201Z", "shell.execute_reply": "2026-01-16T16:16:03.426478Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:13 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'hf_overrides': {'architectures': ['Qwen3VLForSequenceClassification'], 'classifier_from_token': ['no', 'yes'], 'is_original_qwen3_reranker': True}, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B'}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:13 [model.py:859] Resolved `--convert auto` to `--convert classify`. Pass the value explicitly to silence this message.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:13 [model.py:530] Resolved architecture: Qwen3VLForSequenceClassification\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:13 [model.py:1547] Using max model len 262144\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026-01-16 16:15:14,132\tINFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:14 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:14 [vllm.py:618] Asynchronous scheduling is enabled.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:14 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING 01-16 16:15:14 [vllm.py:732] Pooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:17 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, 
otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=PoolerConfig(pooling_type=None, seq_pooling_type='LAST', tok_pooling_type='ALL', normalize=None, dimensions=None, enable_chunked_processing=None, max_embed_len=None, softmax=None, activation=None, use_activation=None, logit_bias=None, step_tag_id=None, returned_token_ids=None), compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 
'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None}\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:18 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:46275 backend=nccl\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:18 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n", "[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:27 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:28 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO 01-16 16:15:28 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m " ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00