deepseek模型权重和官方推理
模型权重目录中的 model.safetensors.index.json 记录了每层模型权重存放在哪个文件中,参考 deepseek-r1 模型权重:https://huggingface.co/deepseek-ai/DeepSeek-R1/tree/main 。从此文件可知,deepseek-r1 模型有58个MOE层,每层MOE包含一个门控、256个路由专家和1个共享专家。下文将查看专家权重。
模型权重分布
模型权重目录中model.safetensors.index.json保存每层模型中权重存放的文件,参考deepseek-r1模型权重:https://huggingface.co/deepseek-ai/DeepSeek-R1/tree/main
从此文件可知,deepseek-r1模型有58个MOE层,每层MOE包含一个门控、256个路由专家(编号0–255)和1个共享专家。
{
"weight_map": {
"model.embed_tokens.weight": "model-00001-of-000163.safetensors",
"model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.0.self_attn.q_a_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
...
"model.layers.3.mlp.gate.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.gate_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.up_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.shared_experts.down_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.gate_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.gate_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.up_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.up_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.down_proj.weight": "model-00001-of-000163.safetensors",
"model.layers.3.mlp.experts.0.down_proj.weight_scale_inv": "model-00001-of-000163.safetensors",
...
}
}
查看专家权重
# 获取权重参数脚本:读取 safetensors 分片并打印专家权重信息
#!/usr/bin/env python3
"""Inspect the first routed-expert weight tensor in a DeepSeek-R1 safetensors shard.

Opens one checkpoint shard, finds the first key containing 'experts',
prints the tensor together with its element count, shape, and rank,
then stops.
"""
from safetensors import safe_open

file_path = "/data/DeepSeek-R1/model-00019-of-000163.safetensors"

with safe_open(file_path, framework="pt") as f:
    for key in f.keys():
        # Filter by key name first so non-expert tensors are never loaded.
        if 'experts' in key:
            tensor = f.get_tensor(key)
            num_elements = tensor.numel()
            size = tensor.size()
            dim = tensor.dim()
            print('weight name:', key)
            print('weight tensor:', tensor)
            print('tensor num_elements:', num_elements)
            print('tensor size:', size)
            print('tensor dim:', dim)
            # Only the first expert tensor is needed as a sample.
            break
# 执行结果
weight name: model.layers.9.mlp.experts.100.down_proj.weight
weight tensor: tensor([[ -20.0000, 104.0000, -52.0000, ..., 3.5000, 80.0000,
15.0000],
[ -28.0000, -20.0000, 48.0000, ..., 11.0000, -26.0000,
18.0000],
[ -0.8750, 72.0000, 64.0000, ..., 24.0000, 128.0000,
120.0000],
...,
[ -36.0000, -240.0000, -64.0000, ..., 40.0000, 44.0000,
22.0000],
[ -10.0000, 144.0000, 9.0000, ..., 72.0000, 144.0000,
-26.0000],
[ 44.0000, -96.0000, 15.0000, ..., -20.0000, -72.0000,
44.0000]], dtype=torch.float8_e4m3fn)
tensor num_elements: 14680064
tensor size: torch.Size([7168, 2048])
tensor dim: 2
模型权重切分
权重切分可参考: https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/convert.py
以两机16卡为例,切分策略如下:
专家(不包括共享专家):按EP16专家切分,每张卡加载16个专家的完整权重
共享专家:按TP16张量切分,每张卡加载部分权重
门控:每张卡加载门控的完整权重
权重切分后,将每张卡需要的权重单独保存到文件
model0-mp16.safetensors model12-mp16.safetensors model15-mp16.safetensors model3-mp16.safetensors model6-mp16.safetensors model9-mp16.safetensors
model10-mp16.safetensors model13-mp16.safetensors model1-mp16.safetensors model4-mp16.safetensors model7-mp16.safetensors
model11-mp16.safetensors model14-mp16.safetensors model2-mp16.safetensors model5-mp16.safetensors model8-mp16.safetensors
每个文件中权重如下,包括每层MOE的门控,16个专家和一个共享专家的部分张量
embed.weight
head.weight
layers.0.attn.kv_norm.weight
layers.0.attn.q_norm.weight
...
layers.3.ffn.gate.bias
layers.3.ffn.gate.weight
layers.3.ffn.shared_experts.w1.scale
layers.3.ffn.shared_experts.w1.weight
layers.3.ffn.shared_experts.w2.scale
layers.3.ffn.shared_experts.w2.weight
layers.3.ffn.shared_experts.w3.scale
layers.3.ffn.shared_experts.w3.weight
...
layers.3.ffn.experts.0.w1.scale
layers.3.ffn.experts.0.w1.weight
layers.3.ffn.experts.0.w2.scale
layers.3.ffn.experts.0.w2.weight
layers.3.ffn.experts.0.w3.scale
layers.3.ffn.experts.0.w3.weight
...
layers.3.ffn.experts.15.w1.scale
layers.3.ffn.experts.15.w1.weight
layers.3.ffn.experts.15.w2.scale
layers.3.ffn.experts.15.w2.weight
layers.3.ffn.experts.15.w3.scale
layers.3.ffn.experts.15.w3.weight
模型初始化
模型初始化可参考: https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/model.py
class Transformer(nn.Module):
    """DeepSeek transformer: parallel embedding, stacked blocks, norm, and LM head."""

    def __init__(self, args: ModelArgs):
        """
        Initializes the Transformer model.

        Args:
            args (ModelArgs): Model arguments containing transformer parameters.
        """
        # Publish distributed context as module globals; fall back to a
        # single-process default when torch.distributed is not initialized.
        global world_size, rank
        world_size = dist.get_world_size() if dist.is_initialized() else 1
        rank = dist.get_rank() if dist.is_initialized() else 0
        # Select the dtype used by the project's Linear layers:
        # FP8 (e4m3) when requested, otherwise bfloat16.
        Linear.dtype = torch.float8_e4m3fn if args.dtype == "fp8" else torch.bfloat16
        super().__init__()
        self.max_seq_len = args.max_seq_len
        self.embed = ParallelEmbedding(args.vocab_size, args.dim)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(args.n_layers):
            self.layers.append(Block(layer_id, args))
        self.norm = RMSNorm(args.dim)
        # Output head stays in the default dtype rather than the (possibly FP8)
        # Linear dtype selected above.
        self.head = ColumnParallelLinear(args.dim, args.vocab_size, dtype=torch.get_default_dtype())
        # Rotary-embedding frequency table; persistent=False keeps it out of
        # saved state dicts since it is recomputed from args.
        self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
初始化完成后,查看模型结构
Transformer(
(embed): ParallelEmbedding() //对应权重embed.weight
(layers): ModuleList(
(0-2): 3 x Block(
(attn): MLA(
(wq_a): Linear()
(q_norm): RMSNorm()
(wq_b): ColumnParallelLinear()
(wkv_a): Linear()
(kv_norm): RMSNorm() //对应权重layers.0.attn.kv_norm.weight
(wkv_b): ColumnParallelLinear()
(wo): RowParallelLinear()
)
(ffn): MLP(
(w1): ColumnParallelLinear()
(w2): RowParallelLinear()
(w3): ColumnParallelLinear()
)
(attn_norm): RMSNorm()
(ffn_norm): RMSNorm()
)
(3-60): 58 x Block( //58个MOE层
(attn): MLA(
(wq_a): Linear()
(q_norm): RMSNorm()
(wq_b): ColumnParallelLinear()
(wkv_a): Linear()
(kv_norm): RMSNorm()
(wkv_b): ColumnParallelLinear()
(wo): RowParallelLinear()
)
(ffn): MoE(
(gate): Gate() //每个MOE一个门控,对应权重layers.x.ffn.gate.weight
(experts): ModuleList( //每个MOE16个专家
(0-15): 16 x Expert(
(w1): Linear() //对应权重layers.x.ffn.experts.0.w1.weight
(w2): Linear()
(w3): Linear()
)
(16-255): 240 x None
)
(shared_experts): MLP( //每个MOE一个共享专家
(w1): ColumnParallelLinear()
(w2): RowParallelLinear()
(w3): ColumnParallelLinear()
)
)
(attn_norm): RMSNorm()
(ffn_norm): RMSNorm()
)
)
(norm): RMSNorm()
(head): ColumnParallelLinear()
)
模型权重加载
权重加载可参考:https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/generate.py
每张卡加载属于它自己的权重文件:model{rank}-mp{world_size}.safetensors
load_model(model, os.path.join(ckpt_path, f"model{rank}-mp{world_size}.safetensors"))
模型推理
模型推理可参考:https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/tree/main/inference/model.py
通过门控选择专家的权重和全局索引,如果选中的专家在本卡则执行专家处理流程,最后通过all_reduce汇总结果
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Forward pass for the MoE module.

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor after expert routing and computation.
    """
    shape = x.size()
    # Flatten all leading dimensions into a token axis of width self.dim.
    x = x.view(-1, self.dim)
    # Gate produces per-token routing weights and the global indices of
    # the selected experts.
    weights, indices = self.gate(x)
    y = torch.zeros_like(x)
    # Number of tokens routed to each expert across the whole batch.
    counts = torch.bincount(indices.flatten(), minlength=self.n_routed_experts).tolist()
    # Only experts in [experts_start_idx, experts_end_idx) live on this rank;
    # experts that received no tokens are skipped entirely.
    for i in range(self.experts_start_idx, self.experts_end_idx):
        if counts[i] == 0:
            continue
        expert = self.experts[i]
        # idx: token rows routed to expert i; top: which top-k slot chose it.
        idx, top = torch.where(indices == i)
        y[idx] += expert(x[idx]) * weights[idx, top, None]
    # The shared expert processes every token on every rank.
    z = self.shared_experts(x)
    if world_size > 1:
        # Sum the partial routed-expert outputs from all ranks.
        dist.all_reduce(y)
    return (y + z).view(shape)
更多推荐
所有评论(0)