{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "FtQYMbLvgzO-"
   },
   "source": [
    "<table style=\"width:100%\">\n",
    "<tr>\n",
    "<td style=\"vertical-align:middle; text-align:left;\">\n",
    "<font size=\"2\">\n",
    "Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
    "<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
    "<br>汉化的库: <a href=\"https://github.com/GoatCsu/CN-LLMs-from-scratch.git\">https://github.com/GoatCsu/CN-LLMs-from-scratch.git</a>\n",
    "</font>\n",
    "</td>\n",
    "<td style=\"vertical-align:middle; text-align:left;\">\n",
    "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
    "</td>\n",
    "</tr>\n",
    "</table>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EbrESHKtgzPA"
   },
   "source": [
    "# FLOPS分析"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "xS2WjniMgzPB"
   },
   "source": [
    "- FLOPs（每秒浮点运算数）衡量神经网络的计算复杂度。\n",
    "- 高FLOPs计算更加复杂，能耗更高。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "L01-NzkggzPB"
   },
   "outputs": [],
   "source": [
    "# pip install -r requirements-extra.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ObzfVatqgzPC",
    "outputId": "3ead6a41-ac38-4db1-9fc3-012fb3ad18cd"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "thop version: 0.1.1-2209072238\n",
      "torch version: 2.4.1+cu121\n"
     ]
    }
   ],
   "source": [
    "from importlib.metadata import version\n",
    "\n",
    "pkgs = [\n",
    "    \"thop\",\n",
    "    \"torch\",\n",
    "]\n",
    "for p in pkgs:\n",
    "    print(f\"{p} version: {version(p)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "74UpjSLjgzPC"
   },
   "source": [
    "&nbsp;\n",
    "# 固定批次大小的基准测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "90pnCK39gzPD"
   },
   "source": [
    "- 仅有前向传播"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "GerIdRMXd6g9",
    "outputId": "177c6d00-a817-40fe-badd-95cfa8ac9b51"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gpt-small (124M)  : 5.1e+11 FLOPS\n",
      "gpt-medium (355M) : 1.4e+12 FLOPS\n",
      "gpt-large (774M)  : 3.2e+12 FLOPS\n",
      "gpt-xl (1558M)    : 6.4e+12 FLOPS\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from thop import profile\n",
    "\n",
    "from previous_chapters import GPTModel\n",
    "\n",
    "# 基本配置\n",
    "BASE_CONFIG = {\n",
    "    \"vocab_size\": 50257,     # 词汇表大小\n",
    "    \"context_length\": 1024,  # 上下文长度\n",
    "    \"drop_rate\": 0.0,        # 丢弃率\n",
    "    \"qkv_bias\": True         # 是否使用查询-键-值偏置\n",
    "}\n",
    "\n",
    "# 不同规模的GPT模型配置\n",
    "model_configs = {\n",
    "    \"gpt-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12},  # 小型模型\n",
    "    \"gpt-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16}, # 中型模型\n",
    "    \"gpt-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20},  # 大型模型\n",
    "    \"gpt-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25},    # 超大模型\n",
    "}\n",
    "\n",
    "# 设置设备（优先使用GPU，如果没有则使用CPU）\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# 批次大小\n",
    "batch_size = 2\n",
    "# 输入张量：一个大小为(批次大小, 上下文长度)的随机整数张量\n",
    "input_tensor = torch.randint(0, 50257, (batch_size, 1024)).to(device)\n",
    "\n",
    "# 对每个模型配置进行遍历\n",
    "for size in model_configs:\n",
    "    # 更新基础配置\n",
    "    BASE_CONFIG.update(model_configs[size])\n",
    "\n",
    "    # 创建GPT模型并转换为bfloat16类型\n",
    "    model = GPTModel(BASE_CONFIG).bfloat16()\n",
    "    model.to(device)\n",
    "\n",
    "    # MACS（乘加操作）= 浮动计算操作的一种\n",
    "    # MACS通常被认为是两个FLOPS（一个乘法和一个加法）\n",
    "    macs, params = profile(model, inputs=(input_tensor,), verbose=False)\n",
    "    flops = 2*macs  # 计算FLOPS：每个MACS算作2个FLOPS\n",
    "    print(f\"{size:18}: {flops:.1e} FLOPS\")  # 输出FLOPS\n",
    "\n",
    "    # 清除模型并释放GPU缓存\n",
    "    del model\n",
    "    torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_S6V05QmgzPD"
   },
   "source": [
    "&nbsp;\n",
    "# 固定批次大小的简单基准测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "amw4E983gzPD"
   },
   "source": [
    "- 仅有前向传播"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "h08VOiqpgzPE",
    "outputId": "a6a90ef8-28fb-4b55-9268-6915b0c84c51"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing gpt-small (124M)\n",
      "  Batch size 256: 6.5e+13 FLOPS\n",
      "  Batch size 384: 9.7e+13 FLOPS\n",
      "  Batch size 388: 9.8e+13 FLOPS\n",
      "  Batch size 389: 9.8e+13 FLOPS\n",
      "\n",
      "Processing gpt-medium (355M)\n",
      "  Batch size 256: 1.9e+14 FLOPS\n",
      "  Batch size 260: 1.9e+14 FLOPS\n",
      "  Batch size 262: 1.9e+14 FLOPS\n",
      "  Batch size 263: 1.9e+14 FLOPS\n",
      "\n",
      "Processing gpt-large (774M)\n",
      "  Batch size 256: 4.0e+14 FLOPS\n",
      "\n",
      "Processing gpt-xl (1558M)\n",
      "  Batch size 128: 4.1e+14 FLOPS\n",
      "  Batch size 136: 4.3e+14 FLOPS\n",
      "  Batch size 140: 4.5e+14 FLOPS\n",
      "  Batch size 142: 4.5e+14 FLOPS\n",
      "  Batch size 143: 4.6e+14 FLOPS\n"
     ]
    }
   ],
   "source": [
    "for size in model_configs:\n",
    "    print(f\"\\n正在处理 {size}\")\n",
    "    config = BASE_CONFIG.copy()  # 复制基础配置\n",
    "    config.update(model_configs[size])  # 更新配置为对应模型的配置\n",
    "\n",
    "    min_batch_size = 1  # 最小批次大小\n",
    "    max_batch_size = None  # 最大批次大小（初始为空）\n",
    "    max_possible_batch_size = 4096  # 设定最大可能的批次大小\n",
    "\n",
    "    # 通过二分法探索适合的批次大小\n",
    "    while min_batch_size <= max_possible_batch_size:\n",
    "        batch_size = (min_batch_size + max_possible_batch_size) // 2  # 计算当前批次大小\n",
    "        try:\n",
    "            # 创建输入张量，大小为（批次大小, 上下文长度）\n",
    "            input_tensor = torch.randint(\n",
    "                0, config[\"vocab_size\"],\n",
    "                (batch_size, config[\"context_length\"]),\n",
    "                device=device\n",
    "            )\n",
    "\n",
    "            # 创建GPT模型并转换为bfloat16类型\n",
    "            model = GPTModel(config).bfloat16().to(device)\n",
    "\n",
    "            # MACS = 乘加操作（Multiply-Accumulate operations）\n",
    "            # MACS通常计为两个FLOPS（一个乘法操作和一个加法操作）\n",
    "            macs, params = profile(model, inputs=(input_tensor,), verbose=False)\n",
    "            flops = 2 * macs  # 计算FLOPS：每个MACS算作2个FLOPS\n",
    "            print(f\"  批次大小 {batch_size}: {flops:.1e} FLOPS\")\n",
    "\n",
    "            # 如果成功，则尝试更大的批次大小\n",
    "            min_batch_size = batch_size + 1\n",
    "            max_batch_size = batch_size\n",
    "\n",
    "            # 清理模型和输入张量\n",
    "            del model, input_tensor\n",
    "            torch.cuda.empty_cache()\n",
    "\n",
    "        except RuntimeError as e:\n",
    "            if \"out of memory\" in str(e):\n",
    "                # 如果内存溢出，尝试更小的批次大小\n",
    "                max_possible_batch_size = batch_size - 1\n",
    "\n",
    "                # 清理模型和输入张量\n",
    "                try:\n",
    "                    del model, input_tensor\n",
    "                    torch.cuda.empty_cache()\n",
    "                except NameError:\n",
    "                    pass\n",
    "            else:\n",
    "                raise e  # 其他错误，重新抛出异常"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "V4lD7tfcgzPE"
   },
   "source": [
    "&nbsp;\n",
    "# 自动批量大小调整与模型FLOP利用率（MFU）基准测试"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "70Y2mblVgzPE"
   },
   "source": [
    "•\t**模型FLOP利用率（MFU）**的解释来源于PaLM论文\n",
    "\n",
    "> 我们提出了一种新的效率度量标准，它与实现方式无关，并允许更清晰地比较系统效率，称为模型FLOP利用率（MFU）。这是观察到的吞吐量（每秒处理的tokens）与理论最大吞吐量（系统在峰值FLOP时的处理能力）的比率。重要的是，“理论最大”吞吐量只考虑了计算前向和反向传递所需的操作，而不包括重计算操作。\n",
    "\n",
    "$$\\text{MFU} = \\frac{\\text{Observed Tokens per Second}}{\\text{Theoretical Max Tokens per Second}}$$\n",
    "\n",
    "其中\n",
    "\n",
    "$$\\text{Theoretical Max Tokens per Second} = \\frac{\\text{Max FLOPs per Second}}{\\text{Total FLOPs per Token}}$$\n",
    "\n",
    "并且\n",
    "\n",
    "$$\\text{Tokens per Second} = \\frac{\\text{Batch Size} \\times \\text{Sequence Length}}{\\text{Total Time}}$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TKttjC8xgzPF"
   },
   "source": [
    "- 前向传播与反向传播"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6aO4rjtNgzPF"
   },
   "outputs": [],
   "source": [
    "flops_per_second = {\n",
    "    # https://www.techpowerup.com/gpu-specs/h100-pcie-80-gb.c3899\n",
    "    \"H100\": {\n",
    "        torch.float32: 51.22e12,  # NVIDIA H100在FP32模式下的51.22 TFLOPs\n",
    "        torch.float16: 204.9e12,  # NVIDIA H100在FP16模式下的204.9 TFLOPs\n",
    "        torch.bfloat16: 204.9e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/l4.c4091\n",
    "    \"L4\": {\n",
    "        torch.float32: 30.29e12,  # NVIDIA L4在FP32模式下的30.29 TFLOPs\n",
    "        torch.float16: 30.29e12,  # NVIDIA L4在FP16模式下的30.29 TFLOPs\n",
    "        torch.bfloat16: 30.29e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/tesla-t4.c3316\n",
    "    \"T4\": {\n",
    "        torch.float32: 8.1e12,  # NVIDIA T4在FP32模式下的8.1 TFLOPs\n",
    "        torch.float16: 65.13e12,  # NVIDIA T4在FP16模式下的65.13 TFLOPs\n",
    "        torch.bfloat16: 65.13e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/a10g.c3798\n",
    "    \"A10G\": {\n",
    "        torch.float32: 31.52e12,  # NVIDIA A10G在FP32模式下的31.52 TFLOPs\n",
    "        torch.float16: 31.52e12,  # NVIDIA A10G在FP16模式下的31.52 TFLOPs\n",
    "        torch.bfloat16: 31.52e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/a100-pcie-40-gb.c3623\n",
    "    \"A100\": {\n",
    "        torch.float32: 19.49e12,  # NVIDIA A100在FP32模式下的19.49 TFLOPs\n",
    "        torch.float16: 77.97e12,  # NVIDIA A100在FP16模式下的77.97 TFLOPs\n",
    "        torch.bfloat16: 77.97e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/geforce-rtx-3080.c3621\n",
    "    \"RTX_3080\": {\n",
    "        torch.float32: 29.77e12,  # NVIDIA RTX 3080在FP32模式下的29.77 TFLOPs\n",
    "        torch.float16: 29.77e12,  # NVIDIA RTX 3080在FP16模式下的29.77 TFLOPs\n",
    "        torch.bfloat16: 29.77e12\n",
    "    },\n",
    "    # https://www.techpowerup.com/gpu-specs/geforce-rtx-3090.c3622\n",
    "    \"RTX_3090\": {\n",
    "        torch.float32: 35.58e12,  # NVIDIA RTX 3090在FP32模式下的35.58 TFLOPs\n",
    "        torch.float16: 35.58e12,  # NVIDIA RTX 3090在FP16模式下的35.58 TFLOPs\n",
    "        torch.bfloat16: 35.58e12\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "background_save": true,
     "base_uri": "https://localhost:8080/"
    },
    "id": "HW5qWfE7gzPF",
    "outputId": "bb1663bc-ee66-44f1-f54d-0bb66ee0d0c2"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPU Model: A100\n",
      "\n",
      "Processing gpt-small (124M)\n",
      "  Batch size 16: Tokens/sec: 34248.82, MFU: 0.3256\n",
      "  Batch size 24: Tokens/sec: 62568.34, MFU: 0.5948\n",
      "\n",
      "Processing gpt-medium (355M)\n",
      "  Batch size 4: Tokens/sec: 20159.93, MFU: 0.5483\n",
      "  Batch size 6: Tokens/sec: 21717.66, MFU: 0.5907\n",
      "  Batch size 7: Tokens/sec: 22536.25, MFU: 0.6130\n",
      "\n",
      "Processing gpt-large (774M)\n",
      "  Batch size 8: Tokens/sec: 12465.21, MFU: 0.7406\n",
      "\n",
      "Processing gpt-xl (1558M)\n",
      "  Batch size 4: Tokens/sec: 6779.92, MFU: 0.8113\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# 获取当前使用的GPU型号\n",
    "def get_gpu_model(flops_per_second_dict):\n",
    "    device_name = torch.cuda.get_device_name(0)  # 获取GPU设备的名称\n",
    "    for model in flops_per_second_dict.keys():  # 遍历flops_per_second字典中的GPU型号\n",
    "        if model in device_name:  # 如果当前设备名称包含字典中的某个GPU型号\n",
    "            return model  # 返回匹配的GPU型号\n",
    "    return \"Unknown\"  # 如果没有匹配的型号，则返回\"Unknown\"\n",
    "\n",
    "# 获取当前GPU型号\n",
    "gpu_model = get_gpu_model(flops_per_second)\n",
    "print(\"GPU Model:\", gpu_model)  # 输出GPU型号\n",
    "\n",
    "# 如果成功获取到GPU型号，则继续执行基准测试\n",
    "if gpu_model != \"Unknown\":\n",
    "\n",
    "    # 遍历不同的GPT模型配置\n",
    "    for size in model_configs:\n",
    "        print(f\"\\nProcessing {size}\")  # 打印当前正在处理的模型大小\n",
    "        config = BASE_CONFIG.copy()  # 复制基础配置\n",
    "        config.update(model_configs[size])  # 更新配置为当前模型配置\n",
    "\n",
    "        # 初始化最小批次大小，最大批次大小，和最大可能批次大小\n",
    "        min_batch_size = 1\n",
    "        max_batch_size = None\n",
    "        max_possible_batch_size = 4096  # 最大可能批次大小设为4096\n",
    "\n",
    "        # 进行批次大小的二分查找\n",
    "        while min_batch_size <= max_possible_batch_size:\n",
    "            batch_size = (min_batch_size + max_possible_batch_size) // 2  # 计算当前批次大小\n",
    "\n",
    "            try:\n",
    "                # 生成随机的输入数据，大小为(batch_size, context_length)\n",
    "                input_tensor = torch.randint(\n",
    "                    0, config[\"vocab_size\"],\n",
    "                    (batch_size, config[\"context_length\"]),\n",
    "                    device=device\n",
    "                )\n",
    "\n",
    "                # 初始化模型，使用bfloat16精度，并将模型加载到GPU上\n",
    "                model = GPTModel(config).bfloat16().to(device)\n",
    "                model.train()  # 设置模型为训练模式\n",
    "\n",
    "                # 记录开始时间\n",
    "                torch.cuda.synchronize()  # 确保所有CUDA操作已完成\n",
    "                start_time = time.time()\n",
    "\n",
    "                # 前向传播和反向传播\n",
    "                output = model(input_tensor)  # 执行前向传播\n",
    "                loss = output.sum()  # 计算损失（使用dummy loss）\n",
    "                loss.backward()  # 执行反向传播\n",
    "\n",
    "                # 记录结束时间\n",
    "                torch.cuda.synchronize()  # 确保所有CUDA操作已完成\n",
    "                end_time = time.time()\n",
    "\n",
    "                total_time_seconds = end_time - start_time  # 计算总用时\n",
    "\n",
    "                # 计算前向传播的FLOPs\n",
    "                macs, params = profile(model, inputs=(input_tensor,), verbose=False)  # 计算乘加操作次数\n",
    "                flops_forward = 2 * macs  # 假设一个MAC操作等于两个FLOP\n",
    "\n",
    "                # 估算反向传播的FLOPs，通常是前向传播的两倍\n",
    "                flops_backward = 2 * flops_forward\n",
    "\n",
    "                # 计算前向+反向传播的总FLOPs\n",
    "                total_flops = flops_forward + flops_backward  # 或者使用total_flops = flops_forward * 3\n",
    "\n",
    "                # 获取模型参数的数据类型\n",
    "                data_type = next(model.parameters()).dtype\n",
    "                max_flops_per_second = flops_per_second[gpu_model].get(data_type, 0)  # 获取GPU的最大FLOP性能\n",
    "\n",
    "                # 计算每秒处理的tokens数\n",
    "                tokens_processed = batch_size * config[\"context_length\"]  # 处理的tokens总数\n",
    "                tokens_per_second = tokens_processed / total_time_seconds  # 每秒处理的tokens数\n",
    "\n",
    "                # 计算每个token的FLOPs\n",
    "                flops_per_token = total_flops / tokens_processed\n",
    "\n",
    "                # 计算理论最大每秒处理的tokens数\n",
    "                if flops_per_token > 0:\n",
    "                    theoretical_max_tokens_per_second = max_flops_per_second / flops_per_token\n",
    "                else:\n",
    "                    theoretical_max_tokens_per_second = 0  # 避免除以零的错误\n",
    "\n",
    "                # 计算MFU（模型FLOPs利用率）\n",
    "                if theoretical_max_tokens_per_second > 0:\n",
    "                    mfu = tokens_per_second / theoretical_max_tokens_per_second\n",
    "                else:\n",
    "                    mfu = 0  # 避免除以零的错误\n",
    "\n",
    "                # 打印当前批次大小的性能数据\n",
    "                print(f\"  Batch size {batch_size}: Tokens/sec: {tokens_per_second:.2f}, MFU: {mfu:.4f}\")\n",
    "\n",
    "                # 如果当前批次处理成功，尝试更大的批次\n",
    "                min_batch_size = batch_size + 1\n",
    "                max_batch_size = batch_size\n",
    "\n",
    "                # 清理内存\n",
    "                del model, input_tensor, output, loss\n",
    "                torch.cuda.empty_cache()\n",
    "\n",
    "            except RuntimeError as e:\n",
    "                if \"out of memory\" in str(e).lower():  # 如果出现内存不足错误\n",
    "                    # 尝试减少批次大小\n",
    "                    max_possible_batch_size = batch_size - 1\n",
    "\n",
    "                    # 清理内存\n",
    "                    try:\n",
    "                        del model, input_tensor\n",
    "                        torch.cuda.empty_cache()\n",
    "                    except NameError:\n",
    "                        pass\n",
    "                else:\n",
    "                    raise e  # 如果是其他错误，抛出异常\n",
    "\n",
    "# 如果无法识别GPU型号，则提示更新flops_per_second字典\n",
    "else:\n",
    "    print(\"Unknown GPU model. Please update the flops_per_second dictionary with your GPU information.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "LovmswRigzPG"
   },
   "source": [
    "-\t1.0的值为最佳（等于100%）。\n",
    "-\t请注意，由于我们还进行反向传播操作，消耗更多的内存，因此这次选择的批次大小比之前小。"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "A100",
   "machine_shape": "hm",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}