{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example for Qwen3-VL-Reranker using vLLM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import Packages and Prepare Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T16:15:02.083436Z",
     "iopub.status.busy": "2026-01-16T16:15:02.083319Z",
     "iopub.status.idle": "2026-01-16T16:15:13.491057Z",
     "shell.execute_reply": "2026-01-16T16:15:13.490303Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/cpfs01/user/linqi.lmx/envs/vllm014n/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "from typing import Dict, Any\n",
    "from jinja2 import Template\n",
    "from vllm import LLM\n",
    "from vllm.multimodal.utils import fetch_image\n",
    "\n",
    "def parse_input_dict(input_dict: Dict[str, Any]):\n",
    "    \"\"\"\n",
    "    Parse input dictionary to extract image and text content.\n",
    "    Returns the formatted content string and multimodal data.\n",
    "    \"\"\"\n",
    "    image = input_dict.get('image')\n",
    "    text = input_dict.get('text')\n",
    "\n",
    "    mm_data = {\n",
    "        'image': []\n",
    "    }\n",
    "    content = ''\n",
    "    if image:\n",
    "        content += '<|vision_start|><|image_pad|><|vision_end|>'\n",
    "        if isinstance(image, str):\n",
    "            if image.startswith(('http://', 'https://')):\n",
    "                try:\n",
    "                    image_obj = fetch_image(image)\n",
    "                    mm_data['image'].append(image_obj)\n",
    "                except Exception as e:\n",
    "                    print(f\"Warning: Failed to fetch image {image}: {e}\")\n",
    "            else:\n",
    "                abs_image_path = os.path.abspath(image)\n",
    "                if os.path.exists(abs_image_path):\n",
    "                    from PIL import Image\n",
    "                    image_obj = Image.open(abs_image_path)\n",
    "                    mm_data['image'].append(image_obj)\n",
    "                else:\n",
    "                    print(f\"Warning: Image file not found: {abs_image_path}\")\n",
    "        else:\n",
    "            mm_data['image'].append(image)\n",
    "    \n",
    "    if text:\n",
    "        content += text\n",
    "    \n",
    "    return content, mm_data\n",
    "\n",
    "def format_vllm_input(\n",
    "    query_dict: Dict[str, Any],\n",
    "    doc_dict: Dict[str, Any],\n",
    "    chat_template: str\n",
    "):\n",
    "    \"\"\"\n",
    "    Build a single vLLM request for one (query, document) pair.\n",
    "\n",
    "    The query and the document are each parsed with parse_input_dict, the\n",
    "    chat template is rendered with their content, and their images are\n",
    "    merged into one list (query images first, then document images).\n",
    "\n",
    "    Returns a dict with the rendered 'prompt' and the 'multi_modal_data'.\n",
    "    \"\"\"\n",
    "    query_content, query_mm_data = parse_input_dict(query_dict)\n",
    "    doc_content, doc_mm_data = parse_input_dict(doc_dict)\n",
    "\n",
    "    rendered_prompt = Template(chat_template).render(\n",
    "        query_content=query_content,\n",
    "        doc_content=doc_content,\n",
    "    )\n",
    "    merged_images = [*query_mm_data['image'], *doc_mm_data['image']]\n",
    "\n",
    "    return {\n",
    "        'prompt': rendered_prompt,\n",
    "        'multi_modal_data': {'image': merged_images},\n",
    "    }\n",
    "\n",
    "def get_rank_scores(\n",
    "    llm,\n",
    "    inputs: Dict[str, Any],\n",
    "    default_instruction: str = \"Given a search query, retrieve relevant candidates that answer the query.\",\n",
    "    template_path: str = \"reranker_template.jinja\"\n",
    "):\n",
    "    \"\"\"\n",
    "    Generate relevance scores for documents given a query.\n",
    "    Returns a list of scores for each document.\n",
    "    \"\"\"\n",
    "    query_dict = inputs['query']\n",
    "    doc_dicts = inputs['documents']\n",
    "    instruction = inputs.get('instruction') or default_instruction\n",
    "\n",
    "    chat_template = Template(Path(template_path).read_text())\n",
    "    chat_template = chat_template.render(instruction=instruction)\n",
    "\n",
    "    prompts = []\n",
    "\n",
    "    for doc_dict in doc_dicts:\n",
    "        prompt = format_vllm_input(\n",
    "            query_dict, doc_dict, chat_template\n",
    "        )\n",
    "        prompts.append(prompt)\n",
    "\n",
    "    outputs = llm.classify(\n",
    "        prompts=prompts\n",
    "    )\n",
    "    scores = [ output.outputs.probs[0] for output in outputs ]\n",
    "    return scores"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Initialize Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T16:15:13.492787Z",
     "iopub.status.busy": "2026-01-16T16:15:13.492613Z",
     "iopub.status.idle": "2026-01-16T16:16:03.427201Z",
     "shell.execute_reply": "2026-01-16T16:16:03.426478Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:13 [utils.py:267] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'disable_log_stats': True, 'hf_overrides': {'architectures': ['Qwen3VLForSequenceClassification'], 'classifier_from_token': ['no', 'yes'], 'is_original_qwen3_reranker': True}, 'model': '/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:13 [model.py:859] Resolved `--convert auto` to `--convert classify`. Pass the value explicitly to silence this message.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:13 [model.py:530] Resolved architecture: Qwen3VLForSequenceClassification\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:13 [model.py:1547] Using max model len 262144\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 16:15:14,132\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:14 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=16384.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:14 [vllm.py:618] Asynchronous scheduling is enabled.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:14 [vllm.py:625] Disabling NCCL for DP synchronization when using async scheduling.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING 01-16 16:15:14 [vllm.py:732] Pooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:17 [core.py:96] Initializing a V1 LLM engine (v0.14.0rc2.dev90+gbcf2333cd) with config: model='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', speculative_config=None, tokenizer='/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=PoolerConfig(pooling_type=None, seq_pooling_type='LAST', tok_pooling_type='ALL', normalize=None, dimensions=None, enable_chunked_processing=None, max_embed_len=None, softmax=None, activation=None, use_activation=None, logit_bias=None, step_tag_id=None, returned_token_ids=None), compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 
'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [16384], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.PIECEWISE: 1>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None}\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:18 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.78.15.105:46275 backend=nccl\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:18 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n",
      "[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:27 [gpu_model_runner.py:3803] Starting to load model /cpfs01/user/linqi.lmx/models/finetune/qwen/qwen3-vl/Qwen3-VL-Embedding/Qwen3-VL-Reranker-2B...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:28 [mm_encoder_attention.py:86] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:28 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.39it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.39it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:29 [default_loader.py:291] Loading weights took 1.24 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:30 [gpu_model_runner.py:3900] Model loading took 4.4 GiB memory and 1.838998 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:30 [gpu_model_runner.py:4711] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 video items of the maximum feature size.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:48 [backends.py:644] Using cache directory: /cpfs01/user/linqi.lmx/.cache/vllm/torch_compile_cache/c988518be9/rank_0_0/backbone for vLLM's torch.compile\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:48 [backends.py:704] Dynamo bytecode transform time: 7.87 s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:52 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 16384) from the cache, took 0.866 s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:52 [monitor.py:34] torch.compile takes 8.74 s in total\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:53 [gpu_worker.py:355] Available KV cache memory: 65.22 GiB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:53 [kv_cache_utils.py:1307] GPU KV cache size: 610,560 tokens\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:53 [kv_cache_utils.py:1312] Maximum concurrency for 262,144 tokens per request: 2.33x\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 16:15:53,469 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2026-01-16 16:15:53,485 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|                                                        | 0/51 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   2%|▉                                               | 1/51 [00:00<00:11,  4.43it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   6%|██▊                                             | 3/51 [00:00<00:04,  9.89it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  12%|█████▋                                          | 6/51 [00:00<00:02, 15.99it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  16%|███████▌                                        | 8/51 [00:00<00:02, 16.79it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  24%|███████████                                    | 12/51 [00:00<00:01, 22.01it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  29%|█████████████▊                                 | 15/51 [00:00<00:01, 22.91it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  35%|████████████████▌                              | 18/51 [00:00<00:01, 21.15it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  41%|███████████████████▎                           | 21/51 [00:01<00:01, 21.96it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  47%|██████████████████████                         | 24/51 [00:01<00:01, 21.56it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  53%|████████████████████████▉                      | 27/51 [00:01<00:01, 23.44it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  59%|███████████████████████████▋                   | 30/51 [00:01<00:00, 25.06it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  65%|██████████████████████████████▍                | 33/51 [00:01<00:00, 25.86it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  71%|█████████████████████████████████▏             | 36/51 [00:01<00:00, 25.53it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  76%|███████████████████████████████████▉           | 39/51 [00:01<00:00, 25.63it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  82%|██████████████████████████████████████▋        | 42/51 [00:01<00:00, 26.49it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  88%|█████████████████████████████████████████▍     | 45/51 [00:02<00:00, 26.77it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  96%|█████████████████████████████████████████████▏ | 49/51 [00:02<00:00, 28.63it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|███████████████████████████████████████████████| 51/51 [00:02<00:00, 23.35it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:56 [gpu_model_runner.py:4852] Graph capturing finished in 3 secs, took -0.76 GiB\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;36m(EngineCore_DP0 pid=200152)\u001b[0;0m "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:15:56 [core.py:272] init engine (profile, create kv cache, warmup model) took 25.93 seconds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 01-16 16:16:03 [llm.py:347] Supported tasks: ['classify', 'score', 'token_classify']\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model initialized successfully!\n"
     ]
    }
   ],
   "source": [
    "# Initialize the Qwen3-VL-Reranker model.\n",
    "# The checkpoint location is configurable instead of being tied to one machine:\n",
    "# set the QWEN3_VL_RERANKER_MODEL environment variable to a local directory,\n",
    "# otherwise the Hugging Face Hub id below is used.\n",
    "MODEL_PATH = os.environ.get('QWEN3_VL_RERANKER_MODEL', 'Qwen/Qwen3-VL-Reranker-2B')\n",
    "\n",
    "llm = LLM(\n",
    "    model=MODEL_PATH,\n",
    "    runner='pooling',\n",
    "    dtype='bfloat16',\n",
    "    trust_remote_code=True,\n",
    "    # Overrides passed to the model config so that the checkpoint is loaded\n",
    "    # with the Qwen3VLForSequenceClassification architecture.\n",
    "    hf_overrides={\n",
    "        \"architectures\": [\"Qwen3VLForSequenceClassification\"],\n",
    "        \"classifier_from_token\": [\"no\", \"yes\"],\n",
    "        \"is_original_qwen3_reranker\": True,\n",
    "    },\n",
    ")\n",
    "\n",
    "print(\"Model initialized successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Prepare Input Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T16:16:03.429327Z",
     "iopub.status.busy": "2026-01-16T16:16:03.428886Z",
     "iopub.status.idle": "2026-01-16T16:16:03.432738Z",
     "shell.execute_reply": "2026-01-16T16:16:03.432293Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prepared query with 3 candidate documents\n"
     ]
    }
   ],
   "source": [
    "# Define query and candidate documents for reranking\n",
    "inputs = {\n",
    "    \"instruction\": \"Retrieve images or text relevant to the user's query.\",\n",
    "    \"query\": {\n",
    "        \"text\": \"A woman playing with her dog on a beach at sunset.\"\n",
    "    },\n",
    "    \"documents\": [\n",
    "        {\n",
    "            \"text\": \"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.\"\n",
    "        },\n",
    "        {\n",
    "            \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n",
    "        },\n",
    "        {\n",
    "            \"text\": \"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.\",\n",
    "            \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n",
    "        }\n",
    "    ]\n",
    "}\n",
    "\n",
    "print(f\"Prepared query with {len(inputs['documents'])} candidate documents\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Generate Relevance Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-01-16T16:16:03.434072Z",
     "iopub.status.busy": "2026-01-16T16:16:03.433915Z",
     "iopub.status.idle": "2026-01-16T16:16:06.363361Z",
     "shell.execute_reply": "2026-01-16T16:16:06.362689Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Adding requests:   0%|                                                                                                 | 0/3 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Adding requests:  33%|█████████████████████████████▋                                                           | 1/3 [00:02<00:04,  2.40s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Adding requests: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.51it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Adding requests: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.20it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Processed prompts:   0%|                                           | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Processed prompts: 100%|███████████████████████████████████| 3/3 [00:00<00:00, 48.83it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "Processed prompts: 100%|███████████████████████████████████| 3/3 [00:00<00:00, 48.48it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Relevance Scores:\n",
      "Document 1: 0.8600\n",
      "Document 2: 0.7232\n",
      "Document 3: 0.8189\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Score every candidate document against the query\n",
    "scores = get_rank_scores(llm, inputs)\n",
    "\n",
    "# Report the scores with 1-based document numbers\n",
    "print(\"Relevance Scores:\")\n",
    "for doc_number, doc_score in enumerate(scores, start=1):\n",
    "    print(f\"Document {doc_number}: {doc_score:.4f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}