昇腾910B部署Qwen3-32B模型(MindIE推理服务框架)
·
环境准备
docker安装
# Configure the kernel parameters Docker's bridge networking relies on.
# The net.bridge.* keys only exist while the br_netfilter module is loaded,
# so load it first and register it to be loaded again at boot; otherwise the
# /proc write and `sysctl -p` below fail with "No such file or directory".
modprobe br_netfilter
echo "br_netfilter" > /etc/modules-load.d/br_netfilter.conf
echo "net.bridge.bridge-nf-call-ip6tables = 1" >> /etc/sysctl.conf
echo "net.bridge.bridge-nf-call-iptables=1" >> /etc/sysctl.conf
echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf
echo "1" >/proc/sys/net/bridge/bridge-nf-call-iptables
# Apply everything in /etc/sysctl.conf to the running kernel.
sysctl -p
# List every docker-ce version offered by the configured repositories (reverse-sorted)
dnf list docker-ce --showduplicates | sort -r
# Install the pinned Docker version
dnf install docker-ce-26.1.3-1.el8 -y
# Installing the package does not start the daemon; start it now and enable it
# at boot so the later `docker pull` / `docker load` steps can reach dockerd.
systemctl enable --now docker
# Install the standalone docker-compose binary.
# The download name is derived from uname, so the follow-up steps must not
# hard-code "docker-compose-Linux-x86_64" (the MindIE image in this guide is
# pulled for linux/arm64, where `uname -m` reports aarch64). Compose v2 release
# assets use a lower-case OS name, e.g. docker-compose-linux-aarch64.
# Downloading straight to the final path removes the rename step entirely.
compose_os="$(uname -s | tr '[:upper:]' '[:lower:]')"
compose_arch="$(uname -m)"
wget -O /usr/bin/docker-compose \
  "https://github.com/docker/compose/releases/download/v2.40.3/docker-compose-${compose_os}-${compose_arch}"
chmod +x /usr/bin/docker-compose
镜像下载
MindIE(全称 Mind Inference Engine,昇腾推理引擎)是华为昇腾(Ascend)软件栈中专门用于 AI 推理加速的核心组件。
# Pull the MindIE image (arm64 build).
# The tag must be 2.3.0-...: it has to match the image referenced by
# docker-compose.yml, and a tag beginning with "." is not a valid Docker tag.
docker pull --platform linux/arm64 swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.3.0-800I-A2-py311-openeuler24.03-lts
模型下载
- 魔搭社区(ModelScope)模型库下载
Qwen/Qwen3-32B
# Bootstrap/upgrade pip, then install the ModelScope client from the Tsinghua PyPI mirror
python -m ensurepip --upgrade
python -m pip install --index-url https://pypi.tuna.tsinghua.edu.cn/simple modelscope
## Download the Qwen3-32B weights through the ModelScope CLI module
python -m modelscope.cli.cli download --model Qwen/Qwen3-32B
模型部署
# Create the working directory layout
mkdir -pv /data/Qwen3-32B/{images,models,config,scripts}
cd /data/Qwen3-32B
# Copy the downloaded model into models/; the listing should then show:
ls -1 models/
#   Qwen3-32B
# Edit the model's own config. The working directory is /data/Qwen3-32B, so the
# file lives under models/ (a bare Qwen3-32B/config.json does not exist here).
vim models/Qwen3-32B/config.json
## Change the following entry to:
#   "torch_dtype": "float16",
# Copy the image archive into images/; the listing should then show:
ls -1 images/
#   mindie.tar
# Load the image archives. The MindIE image is generic, so this step can be
# skipped for later models that are started from the same image.
# The archives live in images/, not in the current directory, and globbing
# directly avoids parsing `ls` output.
for image_tar in images/*.tar; do
  [ -e "$image_tar" ] || continue   # glob matched nothing
  docker load -i "$image_tar"
done
mindie配置
# Persist the MindIE Service configuration on the host.
# The heredoc below is written verbatim to ./config/config.json; the compose
# file in this guide bind-mounts that file over
# /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json in the container.
# Values the rest of the guide depends on:
#   - ipAddress / port (172.16.0.101:18025) are the endpoint used by the curl
#     and benchmark commands;
#   - npuDeviceIds [[0,1,2,3]] together with worldSize 4 selects four NPUs;
#   - modelWeightPath /models/Qwen3-32B is the in-container path of ./models.
# The heredoc delimiter is unquoted, so the shell would expand $... inside the
# body; the JSON contains no such sequences, so it is written as-is.
cd /data/Qwen3-32B
cat > ./config/config.json << EOF
{
"Version" : "1.0.0",
"ServerConfig" :
{
"ipAddress" : "172.16.0.101",
"managementIpAddress" : "172.16.0.101",
"port" : 18025,
"managementPort" : 18026,
"metricsPort" : 18027,
"allowAllZeroIpListening" : false,
"maxLinkNum" : 1000,
"httpsEnabled" : false,
"fullTextEnabled" : false,
"tlsCaPath" : "security/ca/",
"tlsCaFile" : ["ca.pem"],
"tlsCert" : "security/certs/server.pem",
"tlsPk" : "security/keys/server.key.pem",
"tlsPkPwd" : "security/pass/key_pwd.txt",
"tlsCrlPath" : "security/certs/",
"tlsCrlFiles" : ["server_crl.pem"],
"managementTlsCaFile" : ["management_ca.pem"],
"managementTlsCert" : "security/certs/management/server.pem",
"managementTlsPk" : "security/keys/management/server.key.pem",
"managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
"managementTlsCrlPath" : "security/management/certs/",
"managementTlsCrlFiles" : ["server_crl.pem"],
"metricsTlsCaFile" : ["metrics_ca.pem"],
"metricsTlsCert" : "security/certs/metrics/server.pem",
"metricsTlsPk" : "security/keys/metrics/server.key.pem",
"metricsTlsPkPwd" : "security/pass/metrics/key_pwd.txt",
"metricsTlsCrlPath" : "security/metrics/certs/",
"metricsTlsCrlFiles" : ["server_crl.pem"],
"kmcKsfMaster" : "tools/pmt/master/ksfa",
"kmcKsfStandby" : "tools/pmt/standby/ksfb",
"inferMode" : "standard",
"interCommTLSEnabled" : true,
"interCommPort" : 1121,
"interCommTlsCaPath" : "security/grpc/ca/",
"interCommTlsCaFiles" : ["ca.pem"],
"interCommTlsCert" : "security/grpc/certs/server.pem",
"interCommPk" : "security/grpc/keys/server.key.pem",
"interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
"interCommTlsCrlPath" : "security/grpc/certs/",
"interCommTlsCrlFiles" : ["server_crl.pem"],
"openAiSupport" : "vllm",
"tokenTimeout" : 600,
"e2eTimeout" : 600,
"distDPServerEnabled":false
},
"BackendConfig" : {
"backendName" : "mindieservice_llm_engine",
"modelInstanceNumber" : 1,
"npuDeviceIds" : [[0,1,2,3]],
"tokenizerProcessNumber" : 8,
"multiNodesInferEnabled" : false,
"multiNodesInferPort" : 1120,
"interNodeTLSEnabled" : true,
"interNodeTlsCaPath" : "security/grpc/ca/",
"interNodeTlsCaFiles" : ["ca.pem"],
"interNodeTlsCert" : "security/grpc/certs/server.pem",
"interNodeTlsPk" : "security/grpc/keys/server.key.pem",
"interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
"interNodeTlsCrlPath" : "security/grpc/certs/",
"interNodeTlsCrlFiles" : ["server_crl.pem"],
"interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
"interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
"kvPoolConfig" : {"backend":"", "configPath":""},
"ModelDeployConfig" :
{
"maxSeqLen" : 2560,
"maxInputTokenLen" : 2048,
"truncation" : false,
"ModelConfig" : [
{
"modelInstanceType" : "Standard",
"modelName" : "Qwen3-32B",
"modelWeightPath" : "/models/Qwen3-32B",
"worldSize" : 4,
"cpuMemSize" : 0,
"npuMemSize" : -1,
"backendType" : "atb",
"trustRemoteCode" : false,
"async_scheduler_wait_time": 120,
"kv_trans_timeout": 10,
"kv_link_timeout": 1080
}
]
},
"ScheduleConfig" :
{
"templateType" : "Standard",
"templateName" : "Standard_LLM",
"cacheBlockSize" : 128,
"maxPrefillBatchSize" : 50,
"maxPrefillTokens" : 8192,
"prefillTimeMsPerReq" : 150,
"prefillPolicyType" : 0,
"decodeTimeMsPerReq" : 50,
"decodePolicyType" : 0,
"maxBatchSize" : 200,
"maxIterTimes" : 512,
"maxPreemptCount" : 0,
"supportSelectBatch" : true,
"maxQueueDelayMicroseconds" : 5000,
"maxFirstTokenWaitTime": 2500
}
},
"LogConfig": {
"dynamicLogLevel" : "",
"dynamicLogLevelValidHours" : 2,
"dynamicLogLevelValidTime" : ""
},
"EnableDynamicAdjustTimeoutConfig": false
}
EOF
# Restrict the file to owner read/write and group read.
# NOTE(review): presumably MindIE refuses configs with looser permissions — confirm.
chmod 640 ./config/config.json
修改常用参数
"ipAddress" : "172.16.0.101"            # 业务 IP 地址
"httpsEnabled" : false                  # 关闭 HTTPS 通信
"npuDeviceIds" : [[0,1,2,3]]            # 启用的卡位,按容器内挂载的卡编号计算,挂载 4 张就是 0,1,2,3
"modelName" : "Qwen3-32B"               # 模型名称
"modelWeightPath" : "/models/Qwen3-32B" # 模型路径(容器内路径)
"worldSize" : 4                         # 多卡数量
"maxSeqLen" :                           # 最大序列长度
"maxInputTokenLen" :                    # 最大输入 token 数
"maxIterTimes" :                        # 模型最大输出 token 数
"supportSelectBatch" : true             # 建议打开
PS:
maxSeqLen = maxInputTokenLen + maxIterTimes,即最大序列长度 = 最大输入 token 数 + 最大输出 token 数(示例配置中 2560 = 2048 + 512)。
maxPrefillTokens 为预填充最大 token 数,建议不小于 maxInputTokenLen(示例配置中为 8192)。
容器编排
# Write the docker-compose file for the MindIE container.
# YAML is indentation-sensitive: every key under `services:` must be nested,
# otherwise Compose sees image/devices/volumes/... as unknown top-level keys and
# rejects the file. The delimiter is quoted so the shell writes the body verbatim.
# Host NPUs 4-7 are passed through; inside the container they are addressed by
# the npuDeviceIds from config.json ([[0,1,2,3]] for four mounted cards).
cd /data/Qwen3-32B
cat > docker-compose.yml << 'EOF'
services:
  Qwen3-32B:
    image: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.3.0-800I-A2-py311-openeuler24.03-lts
    hostname: Qwen3-32B
    container_name: Qwen3-32B
    network_mode: "host"
    shm_size: '256gb'
    entrypoint: ["/bin/bash", "/scripts/entrypoint.sh"]
    devices:
      - "/dev/davinci_manager:/dev/davinci_manager:rwm"
      - "/dev/hisi_hdc:/dev/hisi_hdc:rwm"
      - "/dev/devmm_svm:/dev/devmm_svm:rwm"
      - "/dev/davinci4:/dev/davinci4"
      - "/dev/davinci5:/dev/davinci5"
      - "/dev/davinci6:/dev/davinci6"
      - "/dev/davinci7:/dev/davinci7"
    environment:
      - TZ=Asia/Shanghai
      - MIES_SERVICE_MONITOR_MODE=1
      - ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
    volumes:
      - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro
      - /usr/local/Ascend/firmware/:/usr/local/Ascend/firmware:ro
      - /usr/local/sbin:/usr/local/sbin:ro
      - ./models:/models # model directory
      - ./scripts:/scripts
      - ./config/config.json:/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json
EOF
初始化脚本
# Generate the container entrypoint. The delimiter is quoted so the script is
# written verbatim (nothing is expanded by the host shell). The compose file
# runs it via `/bin/bash /scripts/entrypoint.sh`, so no execute bit is needed.
cd /data/Qwen3-32B
cat > scripts/entrypoint.sh << 'EOF'
#!/bin/bash
# 1. Activate the environments (make sure the paths are correct)
echo "Initializing CANN environment..."
source /usr/local/Ascend/ascend-toolkit/set_env.sh
echo "Initializing MindIE environment..."
source /usr/local/Ascend/mindie/set_env.sh
# 2. Start MindIE Service
echo "Starting MindIE Service..."
# Abort if the service directory is missing rather than trying to launch the
# daemon from whatever directory the container started in.
cd /usr/local/Ascend/mindie/latest/mindie-service/bin || {
  echo "MindIE Service bin directory not found" >&2
  exit 1
}
./mindieservice_daemon
# If mindieservice_daemon turns out to be non-blocking, uncomment the next
# line to keep the script (and therefore the container) alive:
# tail -f /dev/null
EOF
启动容器
# Bring the service up in the background from the compose project directory
cd /data/Qwen3-32B
docker-compose up --detach
# Stream the container log
docker logs --follow Qwen3-32B
API调用验证
# List the served models
curl http://172.16.0.101:18025/v1/models
# Verify the API: vLLM-style /generate endpoint (openAiSupport is "vllm").
# The body is JSON, so declare it; without the header curl sends
# application/x-www-form-urlencoded. Sampling keys use snake_case names;
# the run-together spellings (maxtokens, topp, ...) are not recognised names.
# NOTE(review): confirm that do_sample is accepted by this endpoint in your
# MindIE version; the remaining keys follow the vLLM-compatible interface.
curl http://172.16.0.101:18025/generate \
-H "Content-Type: application/json" \
-d '{
"prompt": "你是什么模型",
"max_tokens": 150,
"stream": false,
"do_sample": true,
"repetition_penalty": 1.5,
"temperature": 0.7,
"top_p": 0.95,
"top_k": 100,
"model": "Qwen3-32B"
}'
# Verify the API: OpenAI-compatible chat completions endpoint
curl http://172.16.0.101:18025/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen3-32B",
"messages": [{"role": "user", "content": "你是什么模型"}],
"temperature": 0.7
}'
BenchMark测试
测试文件下载
# Copy the GSM8K dataset into models/ (run from /data/Qwen3-32B);
# the listing should then show both entries:
ls -1 models/
#   Qwen3-32B
#   GSM8K-in3500-bs8000-qwen3
benchmark测试
# Open a shell in the running container; the remaining commands are typed there.
docker exec -it Qwen3-32B /bin/bash
# Tighten permissions on the benchmark/client configs.
# NOTE(review): presumably the tools refuse configs with looser permissions — confirm.
chmod 640 /usr/local/lib/python3.11/site-packages/mindiebenchmark/config/config.json
chmod 640 /usr/local/lib/python3.11/site-packages/mindieclient/python/config/config.json
# Every option line except the last needs a trailing backslash; without the one
# after --MaxOutputLen, --SavePath would be run as a separate (failing) command
# and the results path would never reach benchmark.
benchmark \
--DatasetPath "/models/GSM8K-in3500-bs8000-qwen3" \
--DatasetType "gsm8k" \
--ModelName "Qwen3-32B" \
--ModelPath "/models/Qwen3-32B" \
--TestType vllm_client \
--Concurrency 1 \
--Tokenizer True \
--Http http://172.16.0.101:18025 \
--TestAccuracy True \
--MaxOutputLen 1 \
--SavePath ./benchmark_results01
显卡常用命令
# Show the npu-smi overview of the installed NPUs
npu-smi info
# Check whether the davinci kernel modules are loaded
lsmod | grep davinci
# Show board information for NPU 0 (used here to read the firmware version)
npu-smi info -t board -i 0
# Show how the NPUs are currently being used
# NOTE(review): confirm exactly what /proc/uda/namespace_node reports
cat /proc/uda/namespace_node
更多推荐

所有评论(0)