{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "FtQYMbLvgzO-" }, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
汉化的库: https://github.com/GoatCsu/CN-LLMs-from-scratch.git\n", "
\n", "
\n", "\n", "
\n" ] }, { "cell_type": "markdown", "metadata": { "id": "EbrESHKtgzPA" }, "source": [ "# FLOPS分析" ] }, { "cell_type": "markdown", "metadata": { "id": "xS2WjniMgzPB" }, "source": [ "- FLOPs(每秒浮点运算数)衡量神经网络的计算复杂度。\n", "- 高FLOPs计算更加复杂,能耗更高。" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "L01-NzkggzPB" }, "outputs": [], "source": [ "# pip install -r requirements-extra.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ObzfVatqgzPC", "outputId": "3ead6a41-ac38-4db1-9fc3-012fb3ad18cd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "thop version: 0.1.1-2209072238\n", "torch version: 2.4.1+cu121\n" ] } ], "source": [ "from importlib.metadata import version\n", "\n", "pkgs = [\n", " \"thop\",\n", " \"torch\",\n", "]\n", "for p in pkgs:\n", " print(f\"{p} version: {version(p)}\")" ] }, { "cell_type": "markdown", "metadata": { "id": "74UpjSLjgzPC" }, "source": [ " \n", "# 固定批次大小的基准测试" ] }, { "cell_type": "markdown", "metadata": { "id": "90pnCK39gzPD" }, "source": [ "- 仅有前向传播" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GerIdRMXd6g9", "outputId": "177c6d00-a817-40fe-badd-95cfa8ac9b51" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "gpt-small (124M) : 5.1e+11 FLOPS\n", "gpt-medium (355M) : 1.4e+12 FLOPS\n", "gpt-large (774M) : 3.2e+12 FLOPS\n", "gpt-xl (1558M) : 6.4e+12 FLOPS\n" ] } ], "source": [ "import torch\n", "from thop import profile\n", "\n", "from previous_chapters import GPTModel\n", "\n", "# 基本配置\n", "BASE_CONFIG = {\n", " \"vocab_size\": 50257, # 词汇表大小\n", " \"context_length\": 1024, # 上下文长度\n", " \"drop_rate\": 0.0, # 丢弃率\n", " \"qkv_bias\": True # 是否使用查询-键-值偏置\n", "}\n", "\n", "# 不同规模的GPT模型配置\n", "model_configs = {\n", " \"gpt-small (124M)\": {\"emb_dim\": 768, \"n_layers\": 12, \"n_heads\": 12}, # 小型模型\n", " \"gpt-medium (355M)\": {\"emb_dim\": 1024, \"n_layers\": 24, \"n_heads\": 16}, # 中型模型\n", " \"gpt-large (774M)\": {\"emb_dim\": 1280, \"n_layers\": 36, \"n_heads\": 20}, # 大型模型\n", " \"gpt-xl (1558M)\": {\"emb_dim\": 1600, \"n_layers\": 48, \"n_heads\": 25}, # 超大模型\n", "}\n", "\n", "# 设置设备(优先使用GPU,如果没有则使用CPU)\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# 批次大小\n", "batch_size = 2\n", "# 输入张量:一个大小为(批次大小, 上下文长度)的随机整数张量\n", "input_tensor = torch.randint(0, 50257, (batch_size, 1024)).to(device)\n", "\n", "# 对每个模型配置进行遍历\n", "for size in model_configs:\n", " # 更新基础配置\n", " BASE_CONFIG.update(model_configs[size])\n", "\n", " # 创建GPT模型并转换为bfloat16类型\n", " model = GPTModel(BASE_CONFIG).bfloat16()\n", " model.to(device)\n", "\n", " # MACS(乘加操作)= 浮动计算操作的一种\n", " # MACS通常被认为是两个FLOPS(一个乘法和一个加法)\n", " macs, params = profile(model, inputs=(input_tensor,), verbose=False)\n", " flops = 2*macs # 计算FLOPS:每个MACS算作2个FLOPS\n", " print(f\"{size:18}: {flops:.1e} FLOPS\") # 输出FLOPS\n", "\n", " # 清除模型并释放GPU缓存\n", " del model\n", " torch.cuda.empty_cache()" ] }, { "cell_type": "markdown", "metadata": { "id": "_S6V05QmgzPD" }, "source": [ " \n", "# 固定批次大小的简单基准测试" ] }, { "cell_type": "markdown", "metadata": { "id": "amw4E983gzPD" }, "source": [ "- 仅有前向传播" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h08VOiqpgzPE", "outputId": "a6a90ef8-28fb-4b55-9268-6915b0c84c51" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Processing gpt-small (124M)\n", " Batch size 256: 6.5e+13 FLOPS\n", " Batch size 384: 9.7e+13 FLOPS\n", " Batch size 388: 9.8e+13 FLOPS\n", " Batch size 389: 9.8e+13 FLOPS\n", "\n", "Processing gpt-medium (355M)\n", " Batch size 256: 1.9e+14 FLOPS\n", " Batch size 260: 1.9e+14 FLOPS\n", " Batch size 262: 1.9e+14 FLOPS\n", " Batch size 263: 1.9e+14 FLOPS\n", "\n", "Processing gpt-large (774M)\n", " Batch size 256: 4.0e+14 FLOPS\n", "\n", "Processing gpt-xl (1558M)\n", " Batch size 128: 4.1e+14 FLOPS\n", " Batch size 136: 4.3e+14 FLOPS\n", " Batch size 140: 4.5e+14 FLOPS\n", " Batch size 142: 4.5e+14 FLOPS\n", " Batch size 143: 4.6e+14 FLOPS\n" ] } ], "source": [ "for size in model_configs:\n", " print(f\"\\n正在处理 {size}\")\n", " config = BASE_CONFIG.copy() # 复制基础配置\n", " config.update(model_configs[size]) # 更新配置为对应模型的配置\n", "\n", " min_batch_size = 1 # 最小批次大小\n", " max_batch_size = None # 最大批次大小(初始为空)\n", " max_possible_batch_size = 4096 # 设定最大可能的批次大小\n", "\n", " # 通过二分法探索适合的批次大小\n", " while min_batch_size <= max_possible_batch_size:\n", " batch_size = (min_batch_size + max_possible_batch_size) // 2 # 计算当前批次大小\n", " try:\n", " # 创建输入张量,大小为(批次大小, 上下文长度)\n", " input_tensor = torch.randint(\n", " 0, config[\"vocab_size\"],\n", " (batch_size, config[\"context_length\"]),\n", " device=device\n", " )\n", "\n", " # 创建GPT模型并转换为bfloat16类型\n", " model = GPTModel(config).bfloat16().to(device)\n", "\n", " # MACS = 乘加操作(Multiply-Accumulate operations)\n", " # MACS通常计为两个FLOPS(一个乘法操作和一个加法操作)\n", " macs, params = profile(model, inputs=(input_tensor,), verbose=False)\n", " flops = 2 * macs # 计算FLOPS:每个MACS算作2个FLOPS\n", " print(f\" 批次大小 {batch_size}: {flops:.1e} FLOPS\")\n", "\n", " # 如果成功,则尝试更大的批次大小\n", " min_batch_size = batch_size + 1\n", " max_batch_size = batch_size\n", "\n", " # 清理模型和输入张量\n", " del model, input_tensor\n", " torch.cuda.empty_cache()\n", "\n", " except RuntimeError as e:\n", " if \"out of memory\" in str(e):\n", " # 如果内存溢出,尝试更小的批次大小\n", " max_possible_batch_size = batch_size - 1\n", "\n", " # 清理模型和输入张量\n", " try:\n", " del model, input_tensor\n", " torch.cuda.empty_cache()\n", " except NameError:\n", " pass\n", " else:\n", " raise e # 其他错误,重新抛出异常" ] }, { "cell_type": "markdown", "metadata": { "id": "V4lD7tfcgzPE" }, "source": [ " \n", "# 自动批量大小调整与模型FLOP利用率(MFU)基准测试" ] }, { "cell_type": "markdown", "metadata": { "id": "70Y2mblVgzPE" }, "source": [ "•\t**模型FLOP利用率(MFU)**的解释来源于PaLM论文\n", "\n", "> 我们提出了一种新的效率度量标准,它与实现方式无关,并允许更清晰地比较系统效率,称为模型FLOP利用率(MFU)。这是观察到的吞吐量(每秒处理的tokens)与理论最大吞吐量(系统在峰值FLOP时的处理能力)的比率。重要的是,“理论最大”吞吐量只考虑了计算前向和反向传递所需的操作,而不包括重计算操作。\n", "\n", "$$\\text{MFU} = \\frac{\\text{Observed Tokens per Second}}{\\text{Theoretical Max Tokens per Second}}$$\n", "\n", "其中\n", "\n", "$$\\text{Theoretical Max Tokens per Second} = \\frac{\\text{Max FLOPs per Second}}{\\text{Total FLOPs per Token}}$$\n", "\n", "并且\n", "\n", "$$\\text{Tokens per Second} = \\frac{\\text{Batch Size} \\times \\text{Sequence Length}}{\\text{Total Time}}$$" ] }, { "cell_type": "markdown", "metadata": { "id": "TKttjC8xgzPF" }, "source": [ "- 前向传播与反向传播" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6aO4rjtNgzPF" }, "outputs": [], "source": [ "flops_per_second = {\n", " # https://www.techpowerup.com/gpu-specs/h100-pcie-80-gb.c3899\n", " \"H100\": {\n", " torch.float32: 51.22e12, # NVIDIA H100在FP32模式下的51.22 TFLOPs\n", " torch.float16: 204.9e12, # NVIDIA H100在FP16模式下的204.9 TFLOPs\n", " torch.bfloat16: 204.9e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/l4.c4091\n", " \"L4\": {\n", " torch.float32: 30.29e12, # NVIDIA L4在FP32模式下的30.29 TFLOPs\n", " torch.float16: 30.29e12, # NVIDIA L4在FP16模式下的30.29 TFLOPs\n", " torch.bfloat16: 30.29e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/tesla-t4.c3316\n", " \"T4\": {\n", " torch.float32: 8.1e12, # NVIDIA T4在FP32模式下的8.1 TFLOPs\n", " torch.float16: 65.13e12, # NVIDIA T4在FP16模式下的65.13 TFLOPs\n", " torch.bfloat16: 65.13e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/a10g.c3798\n", " \"A10G\": {\n", " torch.float32: 31.52e12, # NVIDIA A10G在FP32模式下的31.52 TFLOPs\n", " torch.float16: 31.52e12, # NVIDIA A10G在FP16模式下的31.52 TFLOPs\n", " torch.bfloat16: 31.52e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/a100-pcie-40-gb.c3623\n", " \"A100\": {\n", " torch.float32: 19.49e12, # NVIDIA A100在FP32模式下的19.49 TFLOPs\n", " torch.float16: 77.97e12, # NVIDIA A100在FP16模式下的77.97 TFLOPs\n", " torch.bfloat16: 77.97e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/geforce-rtx-3080.c3621\n", " \"RTX_3080\": {\n", " torch.float32: 29.77e12, # NVIDIA RTX 3080在FP32模式下的29.77 TFLOPs\n", " torch.float16: 29.77e12, # NVIDIA RTX 3080在FP16模式下的29.77 TFLOPs\n", " torch.bfloat16: 29.77e12\n", " },\n", " # https://www.techpowerup.com/gpu-specs/geforce-rtx-3090.c3622\n", " \"RTX_3090\": {\n", " torch.float32: 35.58e12, # NVIDIA RTX 3090在FP32模式下的35.58 TFLOPs\n", " torch.float16: 35.58e12, # NVIDIA RTX 3090在FP16模式下的35.58 TFLOPs\n", " torch.bfloat16: 35.58e12\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "background_save": true, "base_uri": "https://localhost:8080/" }, "id": "HW5qWfE7gzPF", "outputId": "bb1663bc-ee66-44f1-f54d-0bb66ee0d0c2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GPU Model: A100\n", "\n", "Processing gpt-small (124M)\n", " Batch size 16: Tokens/sec: 34248.82, MFU: 0.3256\n", " Batch size 24: Tokens/sec: 62568.34, MFU: 0.5948\n", "\n", "Processing gpt-medium (355M)\n", " Batch size 4: Tokens/sec: 20159.93, MFU: 0.5483\n", " Batch size 6: Tokens/sec: 21717.66, MFU: 0.5907\n", " Batch size 7: Tokens/sec: 22536.25, MFU: 0.6130\n", "\n", "Processing gpt-large (774M)\n", " Batch size 8: Tokens/sec: 12465.21, MFU: 0.7406\n", "\n", "Processing gpt-xl (1558M)\n", " Batch size 4: Tokens/sec: 6779.92, MFU: 0.8113\n" ] } ], "source": [ "import time\n", "\n", "# 获取当前使用的GPU型号\n", "def get_gpu_model(flops_per_second_dict):\n", " device_name = torch.cuda.get_device_name(0) # 获取GPU设备的名称\n", " for model in flops_per_second_dict.keys(): # 遍历flops_per_second字典中的GPU型号\n", " if model in device_name: # 如果当前设备名称包含字典中的某个GPU型号\n", " return model # 返回匹配的GPU型号\n", " return \"Unknown\" # 如果没有匹配的型号,则返回\"Unknown\"\n", "\n", "# 获取当前GPU型号\n", "gpu_model = get_gpu_model(flops_per_second)\n", "print(\"GPU Model:\", gpu_model) # 输出GPU型号\n", "\n", "# 如果成功获取到GPU型号,则继续执行基准测试\n", "if gpu_model != \"Unknown\":\n", "\n", " # 遍历不同的GPT模型配置\n", " for size in model_configs:\n", " print(f\"\\nProcessing {size}\") # 打印当前正在处理的模型大小\n", " config = BASE_CONFIG.copy() # 复制基础配置\n", " config.update(model_configs[size]) # 更新配置为当前模型配置\n", "\n", " # 初始化最小批次大小,最大批次大小,和最大可能批次大小\n", " min_batch_size = 1\n", " max_batch_size = None\n", " max_possible_batch_size = 4096 # 最大可能批次大小设为4096\n", "\n", " # 进行批次大小的二分查找\n", " while min_batch_size <= max_possible_batch_size:\n", " batch_size = (min_batch_size + max_possible_batch_size) // 2 # 计算当前批次大小\n", "\n", " try:\n", " # 生成随机的输入数据,大小为(batch_size, context_length)\n", " input_tensor = torch.randint(\n", " 0, config[\"vocab_size\"],\n", " (batch_size, config[\"context_length\"]),\n", " device=device\n", " )\n", "\n", " # 初始化模型,使用bfloat16精度,并将模型加载到GPU上\n", " model = GPTModel(config).bfloat16().to(device)\n", " model.train() # 设置模型为训练模式\n", "\n", " # 记录开始时间\n", " torch.cuda.synchronize() # 确保所有CUDA操作已完成\n", " start_time = time.time()\n", "\n", " # 前向传播和反向传播\n", " output = model(input_tensor) # 执行前向传播\n", " loss = output.sum() # 计算损失(使用dummy loss)\n", " loss.backward() # 执行反向传播\n", "\n", " # 记录结束时间\n", " torch.cuda.synchronize() # 确保所有CUDA操作已完成\n", " end_time = time.time()\n", "\n", " total_time_seconds = end_time - start_time # 计算总用时\n", "\n", " # 计算前向传播的FLOPs\n", " macs, params = profile(model, inputs=(input_tensor,), verbose=False) # 计算乘加操作次数\n", " flops_forward = 2 * macs # 假设一个MAC操作等于两个FLOP\n", "\n", " # 估算反向传播的FLOPs,通常是前向传播的两倍\n", " flops_backward = 2 * flops_forward\n", "\n", " # 计算前向+反向传播的总FLOPs\n", " total_flops = flops_forward + flops_backward # 或者使用total_flops = flops_forward * 3\n", "\n", " # 获取模型参数的数据类型\n", " data_type = next(model.parameters()).dtype\n", " max_flops_per_second = flops_per_second[gpu_model].get(data_type, 0) # 获取GPU的最大FLOP性能\n", "\n", " # 计算每秒处理的tokens数\n", " tokens_processed = batch_size * config[\"context_length\"] # 处理的tokens总数\n", " tokens_per_second = tokens_processed / total_time_seconds # 每秒处理的tokens数\n", "\n", " # 计算每个token的FLOPs\n", " flops_per_token = total_flops / tokens_processed\n", "\n", " # 计算理论最大每秒处理的tokens数\n", " if flops_per_token > 0:\n", " theoretical_max_tokens_per_second = max_flops_per_second / flops_per_token\n", " else:\n", " theoretical_max_tokens_per_second = 0 # 避免除以零的错误\n", "\n", " # 计算MFU(模型FLOPs利用率)\n", " if theoretical_max_tokens_per_second > 0:\n", " mfu = tokens_per_second / theoretical_max_tokens_per_second\n", " else:\n", " mfu = 0 # 避免除以零的错误\n", "\n", " # 打印当前批次大小的性能数据\n", " print(f\" Batch size {batch_size}: Tokens/sec: {tokens_per_second:.2f}, MFU: {mfu:.4f}\")\n", "\n", " # 如果当前批次处理成功,尝试更大的批次\n", " min_batch_size = batch_size + 1\n", " max_batch_size = batch_size\n", "\n", " # 清理内存\n", " del model, input_tensor, output, loss\n", " torch.cuda.empty_cache()\n", "\n", " except RuntimeError as e:\n", " if \"out of memory\" in str(e).lower(): # 如果出现内存不足错误\n", " # 尝试减少批次大小\n", " max_possible_batch_size = batch_size - 1\n", "\n", " # 清理内存\n", " try:\n", " del model, input_tensor\n", " torch.cuda.empty_cache()\n", " except NameError:\n", " pass\n", " else:\n", " raise e # 如果是其他错误,抛出异常\n", "\n", "# 如果无法识别GPU型号,则提示更新flops_per_second字典\n", "else:\n", " print(\"Unknown GPU model. Please update the flops_per_second dictionary with your GPU information.\")" ] }, { "cell_type": "markdown", "metadata": { "id": "LovmswRigzPG" }, "source": [ "-\t1.0的值为最佳(等于100%)。\n", "-\t请注意,由于我们还进行反向传播操作,消耗更多的内存,因此这次选择的批次大小比之前小。" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "A100", "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 4 }