模型权重分布

模型权重目录中model.safetensors.index.json保存每层模型中权重存放的文件,参考deepseek-r1模型权重:https://huggingface.co/deepseek-ai/DeepSeek-R1/tree/main

从此文件可知,deepseek-r1模型有58个MOE层,每层MOE包含一个门控,256个专家(路由专家,编号0-255)和1个共享专家。

{
  "weight_map": {
    "model.embed_tokens.weight": "model-00001-of-000163.safetensors",
    "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.0.self_attn.q_a_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    ...
    "model.layers.3.mlp.gate.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.gate_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.up_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.shared_experts.down_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.gate_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.gate_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.up_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.up_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.down_proj.weight": "model-00001-of-000163.safetensors",
    "model.layers.3.mlp.experts.0.down_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
    ...
  }
}

查看专家权重

//获取权重参数脚本
#!/usr/bin/env python3
"""Print the first expert weight tensor found in a safetensors shard."""

import os
from safetensors import safe_open

file_path = "/data/DeepSeek-R1/model-00019-of-000163.safetensors"

# framework="pt" makes get_tensor return torch tensors.
with safe_open(file_path, framework="pt") as f:
    for key in f.keys():
        # Filter BEFORE materializing the tensor: the original loaded every
        # tensor (attention weights included) just to discard the non-expert
        # ones, wasting time and memory on a multi-GB shard.
        if 'experts' not in key:
            continue
        tensor = f.get_tensor(key)
        print('weight name:', key)
        print('weight tensor:', tensor)
        print('tensor num_elements:', tensor.numel())
        print('tensor size:', tensor.size())
        print('tensor dim:', tensor.dim())
        break  # one sample tensor is enough for inspection

//执行结果
weight name: model.layers.9.mlp.experts.100.down_proj.weight
weight tensor: tensor([[ -20.0000,  104.0000,  -52.0000,  ...,    3.5000,   80.0000,
           15.0000],
        [ -28.0000,  -20.0000,   48.0000,  ...,   11.0000,  -26.0000,
           18.0000],
        [  -0.8750,   72.0000,   64.0000,  ...,   24.0000,  128.0000,
          120.0000],
        ...,
        [ -36.0000, -240.0000,  -64.0000,  ...,   40.0000,   44.0000,
           22.0000],
        [ -10.0000,  144.0000,    9.0000,  ...,   72.0000,  144.0000,
          -26.0000],
        [  44.0000,  -96.0000,   15.0000,  ...,  -20.0000,  -72.0000,
           44.0000]], dtype=torch.float8_e4m3fn)
tensor num_elements: 14680064
tensor size: torch.Size([7168, 2048])
tensor dim: 2

模型权重切分

权重切分可参考: https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/convert.py
以两机16卡为例,切分策略如下:

专家(不包括共享专家):按EP16专家切分,每张卡加载16个专家的完整权重
共享专家:按TP16张量切分,每张卡加载部分权重
门控:每张卡加载门控的完整权重

权重切分后,将每张卡需要的权重单独保存到文件

model0-mp16.safetensors   model12-mp16.safetensors  model15-mp16.safetensors  model3-mp16.safetensors  model6-mp16.safetensors  model9-mp16.safetensors
model10-mp16.safetensors  model13-mp16.safetensors  model1-mp16.safetensors   model4-mp16.safetensors  model7-mp16.safetensors
model11-mp16.safetensors  model14-mp16.safetensors  model2-mp16.safetensors   model5-mp16.safetensors  model8-mp16.safetensors

每个文件中权重如下,包括每层MOE的门控,16个专家和一个共享专家的部分张量

embed.weight
head.weight
layers.0.attn.kv_norm.weight
layers.0.attn.q_norm.weight
...
layers.3.ffn.gate.bias
layers.3.ffn.gate.weight
layers.3.ffn.shared_experts.w1.scale
layers.3.ffn.shared_experts.w1.weight
layers.3.ffn.shared_experts.w2.scale
layers.3.ffn.shared_experts.w2.weight
layers.3.ffn.shared_experts.w3.scale
layers.3.ffn.shared_experts.w3.weight
...
layers.3.ffn.experts.0.w1.scale
layers.3.ffn.experts.0.w1.weight
layers.3.ffn.experts.0.w2.scale
layers.3.ffn.experts.0.w2.weight
layers.3.ffn.experts.0.w3.scale
layers.3.ffn.experts.0.w3.weight
...
layers.3.ffn.experts.15.w1.scale
layers.3.ffn.experts.15.w1.weight
layers.3.ffn.experts.15.w2.scale
layers.3.ffn.experts.15.w2.weight
layers.3.ffn.experts.15.w3.scale
layers.3.ffn.experts.15.w3.weight

模型初始化

模型初始化可参考: https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/model.py

class Transformer(nn.Module):
    def __init__(self, args: ModelArgs):
        """
        Initializes the Transformer model.

        Args:
            args (ModelArgs): Model arguments containing transformer parameters.
        """
        # Distributed context is published as module-level globals so that
        # other code in this file can read rank/world_size without
        # re-querying torch.distributed; falls back to single-process values
        # when no process group has been initialized.
        global world_size, rank
        world_size = dist.get_world_size() if dist.is_initialized() else 1
        rank = dist.get_rank() if dist.is_initialized() else 0
        # Class-level dtype is set BEFORE any submodules are constructed —
        # presumably Linear instances created below read Linear.dtype at
        # construction time (TODO confirm against Linear in model.py).
        Linear.dtype = torch.float8_e4m3fn if args.dtype == "fp8" else torch.bfloat16
        super().__init__()
        self.max_seq_len = args.max_seq_len
        # Token embedding; "Parallel" suggests the vocab dimension is sharded
        # across ranks — verify against ParallelEmbedding's implementation.
        self.embed = ParallelEmbedding(args.vocab_size, args.dim)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(args.n_layers):
            self.layers.append(Block(layer_id, args))
        self.norm = RMSNorm(args.dim)
        # Output projection (LM head), kept in the default dtype rather than
        # the possibly-fp8 Linear.dtype set above.
        self.head = ColumnParallelLinear(args.dim, args.vocab_size, dtype=torch.get_default_dtype())
        # Rotary-embedding frequency table; persistent=False keeps this
        # recomputable buffer out of saved checkpoints.
        self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)

初始化完成后,查看模型结构

Transformer(
  (embed): ParallelEmbedding() //对应权重embed.weight
  (layers): ModuleList(
    (0-2): 3 x Block(
      (attn): MLA(
        (wq_a): Linear()
        (q_norm): RMSNorm()
        (wq_b): ColumnParallelLinear()
        (wkv_a): Linear()
        (kv_norm): RMSNorm()  //对应权重layers.0.attn.kv_norm.weight
        (wkv_b): ColumnParallelLinear()
        (wo): RowParallelLinear()
      )
      (ffn): MLP(
        (w1): ColumnParallelLinear()
        (w2): RowParallelLinear()
        (w3): ColumnParallelLinear()
      )
      (attn_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
    (3-60): 58 x Block(  //58个MOE层
      (attn): MLA(
        (wq_a): Linear()
        (q_norm): RMSNorm()
        (wq_b): ColumnParallelLinear()
        (wkv_a): Linear()
        (kv_norm): RMSNorm()
        (wkv_b): ColumnParallelLinear()
        (wo): RowParallelLinear()
      )
      (ffn): MoE(
        (gate): Gate()  //每个MOE一个门控,对应权重layers.x.ffn.gate.weight
        (experts): ModuleList( //每个MOE16个专家
          (0-15): 16 x Expert(
            (w1): Linear()  //对应权重layers.x.ffn.experts.0.w1.weight
            (w2): Linear()
            (w3): Linear()
          )
          (16-255): 240 x None
        )
        (shared_experts): MLP( //每个MOE一个共享专家
          (w1): ColumnParallelLinear()
          (w2): RowParallelLinear()
          (w3): ColumnParallelLinear()
        )
      )
      (attn_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (head): ColumnParallelLinear()
)

模型权重加载

权重加载可参考:https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/generate.py

每张卡加载属于它自己的权重文件:model{rank}-mp{world_size}.safetensors

load_model(model, os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors"))

模型推理

模型推理可参考:https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/model.py

通过门控选择专家的权重和索引,如果选中的专家在本卡则执行专家处理流程,最后通过all_reduce汇总结果

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass for the MoE module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after expert routing and computation.
        """
        shape = x.size()
        # Flatten all leading dims into a token axis: (num_tokens, dim).
        x = x.view(-1, self.dim)
        # Gate returns per-token routing weights and the indices of the
        # selected experts.  BUG FIX: the original line carried a C-style
        # `//门控输出...` trailer, which Python parses as floor division by an
        # undefined identifier -> NameError at runtime; now a `#` comment.
        weights, indices = self.gate(x)
        y = torch.zeros_like(x)
        # Tokens routed to each expert, so empty experts can be skipped.
        counts = torch.bincount(indices.flatten(), minlength=self.n_routed_experts).tolist()
        # Only experts hosted on this rank ([experts_start_idx, experts_end_idx))
        # are evaluated locally; other ranks contribute via all_reduce below.
        for i in range(self.experts_start_idx, self.experts_end_idx):
            if counts[i] == 0:
                continue
            expert = self.experts[i]
            # idx: token rows routed to expert i; top: slot within each
            # token's top-k selection (used to pick the matching weight).
            idx, top = torch.where(indices == i)
            y[idx] += expert(x[idx]) * weights[idx, top, None]
        # Shared expert processes every token on every rank (TP-sharded).
        z = self.shared_experts(x)
        if world_size > 1:
            dist.all_reduce(y)
        return (y + z).view(shape)
Logo

欢迎加入DeepSeek 技术社区。在这里,你可以找到志同道合的朋友,共同探索AI技术的奥秘。

更多推荐