环境准备

docker安装

# Configure the kernel parameters needed for Docker bridge networking.
# The net.bridge.* keys only exist once the br_netfilter module is loaded, so
# load it now and register it to be loaded again on every boot.
modprobe br_netfilter
echo "br_netfilter" > /etc/modules-load.d/br_netfilter.conf

# Keep the settings in a dedicated drop-in file. Overwriting it (instead of
# appending to /etc/sysctl.conf) means re-running this step never duplicates entries.
cat > /etc/sysctl.d/99-docker.conf << 'EOF'
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
EOF

# Apply every sysctl configuration file, including the one just written.
sysctl --system

# List the docker-ce versions available in the configured repositories
dnf list docker-ce --showduplicates | sort -r

# Install the pinned docker-ce version
dnf install docker-ce-26.1.3-1.el8 -y

# Start the Docker daemon now and enable it at boot; the later
# docker pull / docker load steps need a running daemon.
systemctl enable --now docker

# Install docker-compose.
# Download straight to the final path: the asset name depends on `uname -m`
# (x86_64, aarch64, ...), so it must not be referred to by a hard-coded file name.
# Release assets are published as docker-compose-linux-<arch> (lowercase "linux").
wget -O /usr/bin/docker-compose "https://github.com/docker/compose/releases/download/v2.40.3/docker-compose-linux-$(uname -m)"
chmod +x /usr/bin/docker-compose

镜像下载

MindIE(全称 Mind Inference Engine,昇腾推理引擎)是华为昇腾(Ascend)软件栈中专门用于 AI 推理加速的核心组件。

# Pull the MindIE image (the tag must match the one used in docker-compose.yml)
docker pull --platform linux/arm64 swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.3.0-800I-A2-py311-openeuler24.03-lts

模型下载

  • 魔搭社区(ModelScope)模型库下载Qwen/Qwen3-32B
# Install modelscope (make sure pip is available first, then install from the Tsinghua PyPI mirror)
python -m ensurepip --upgrade
python -m pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple

## Download Qwen3-32B with the modelscope CLI
python -m modelscope.cli.cli download --model Qwen/Qwen3-32B

模型部署

# Create the working directories (images, models, config, scripts) and enter the project root
mkdir -pv /data/Qwen3-32B/{images,models,config,scripts}
cd /data/Qwen3-32B

# 模型拷贝到models
ls -1 models/
Qwen3-32B

# Edit the model configuration.
# The working directory is /data/Qwen3-32B and the weights were copied under models/,
# so the file lives at models/Qwen3-32B/config.json.
vim models/Qwen3-32B/config.json
## 修改如下内容
  "torch_dtype": "float16",

# 把镜像拷贝到images
ls -1 images/
mindie.tar

# Load the image archives. The mindie image is a shared base image, so this step
# can be skipped for later models that are started from the same image.
# The archives were copied into images/, so glob that directory directly
# instead of parsing `ls` output from the project root.
for image_tar in images/*.tar; do
  [ -e "$image_tar" ] || continue   # no archive present: nothing to load
  docker load -i "$image_tar"
done

mindie配置

# Persist the mindie-service configuration on the host.
# Writes ./config/config.json (bind-mounted into the container by docker-compose.yml)
# and then restricts the file to mode 640.
cd /data/Qwen3-32B
cat > ./config/config.json << EOF
{
    "Version" : "1.0.0",

    "ServerConfig" :
    {
        "ipAddress" : "172.16.0.101",
        "managementIpAddress" : "172.16.0.101",
        "port" : 18025,
        "managementPort" : 18026,
        "metricsPort" : 18027,
        "allowAllZeroIpListening" : false,
        "maxLinkNum" : 1000,
        "httpsEnabled" : false,
        "fullTextEnabled" : false,
        "tlsCaPath" : "security/ca/",
        "tlsCaFile" : ["ca.pem"],
        "tlsCert" : "security/certs/server.pem",
        "tlsPk" : "security/keys/server.key.pem",
        "tlsPkPwd" : "security/pass/key_pwd.txt",
        "tlsCrlPath" : "security/certs/",
        "tlsCrlFiles" : ["server_crl.pem"],
        "managementTlsCaFile" : ["management_ca.pem"],
        "managementTlsCert" : "security/certs/management/server.pem",
        "managementTlsPk" : "security/keys/management/server.key.pem",
        "managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
        "managementTlsCrlPath" : "security/management/certs/",
        "managementTlsCrlFiles" : ["server_crl.pem"],
        "metricsTlsCaFile" : ["metrics_ca.pem"],
        "metricsTlsCert" : "security/certs/metrics/server.pem",
        "metricsTlsPk" : "security/keys/metrics/server.key.pem",
        "metricsTlsPkPwd" : "security/pass/metrics/key_pwd.txt",
        "metricsTlsCrlPath" : "security/metrics/certs/",
        "metricsTlsCrlFiles" : ["server_crl.pem"],
        "kmcKsfMaster" : "tools/pmt/master/ksfa",
        "kmcKsfStandby" : "tools/pmt/standby/ksfb",
        "inferMode" : "standard",
        "interCommTLSEnabled" : true,
        "interCommPort" : 1121,
        "interCommTlsCaPath" : "security/grpc/ca/",
        "interCommTlsCaFiles" : ["ca.pem"],
        "interCommTlsCert" : "security/grpc/certs/server.pem",
        "interCommPk" : "security/grpc/keys/server.key.pem",
        "interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
        "interCommTlsCrlPath" : "security/grpc/certs/",
        "interCommTlsCrlFiles" : ["server_crl.pem"],
        "openAiSupport" : "vllm",
        "tokenTimeout" : 600,
        "e2eTimeout" : 600,
        "distDPServerEnabled":false
    },

    "BackendConfig" : {
        "backendName" : "mindieservice_llm_engine",
        "modelInstanceNumber" : 1,
        "npuDeviceIds" : [[0,1,2,3]],
        "tokenizerProcessNumber" : 8,
        "multiNodesInferEnabled" : false,
        "multiNodesInferPort" : 1120,
        "interNodeTLSEnabled" : true,
        "interNodeTlsCaPath" : "security/grpc/ca/",
        "interNodeTlsCaFiles" : ["ca.pem"],
        "interNodeTlsCert" : "security/grpc/certs/server.pem",
        "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
        "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
        "interNodeTlsCrlPath" : "security/grpc/certs/",
        "interNodeTlsCrlFiles" : ["server_crl.pem"],
        "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
        "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
        "kvPoolConfig" : {"backend":"", "configPath":""},
        "ModelDeployConfig" :
        {
            "maxSeqLen" : 2560,
            "maxInputTokenLen" : 2048,
            "truncation" : false,
            "ModelConfig" : [
                {
                    "modelInstanceType" : "Standard",
                    "modelName" : "Qwen3-32B",
                    "modelWeightPath" : "/models/Qwen3-32B",
                    "worldSize" : 4,
                    "cpuMemSize" : 0,
                    "npuMemSize" : -1,
                    "backendType" : "atb",
                    "trustRemoteCode" : false,
                    "async_scheduler_wait_time": 120,
                    "kv_trans_timeout": 10,
                    "kv_link_timeout": 1080
                }
            ]
        },

        "ScheduleConfig" :
        {
            "templateType" : "Standard",
            "templateName" : "Standard_LLM",
            "cacheBlockSize" : 128,

            "maxPrefillBatchSize" : 50,
            "maxPrefillTokens" : 8192,
            "prefillTimeMsPerReq" : 150,
            "prefillPolicyType" : 0,

            "decodeTimeMsPerReq" : 50,
            "decodePolicyType" : 0,

            "maxBatchSize" : 200,
            "maxIterTimes" : 512,
            "maxPreemptCount" : 0,
            "supportSelectBatch" : true,
            "maxQueueDelayMicroseconds" : 5000,
            "maxFirstTokenWaitTime": 2500
        }
    },

    "LogConfig": {
        "dynamicLogLevel" : "",
        "dynamicLogLevelValidHours" : 2,
        "dynamicLogLevelValidTime" : ""
    },

    "EnableDynamicAdjustTimeoutConfig": false
}
EOF
chmod 640 ./config/config.json

修改常用参数

"ipAddress" : "172.16.0.101"                          # 业务ip地址
"httpsEnabled" : false                                # 关闭https通信
"npuDeviceIds" : [[0,1,2,3]]                          # 启用卡位, 按挂载卡算, 挂载4张就是0,1,2,3
"modelName" : "Qwen3-32B"                             # 模型名称
"modelWeightPath" : "/models/Qwen3-32B"               # 模型路径(容器内路径)
"worldSize" : 4                                       # 多卡数量
"maxSeqLen" : 2560                                    # 最大序列长度
"maxInputTokenLen" : 2048                             # 最大输入token数
"maxIterTimes" : 512                                  # 模型最大输出token数
"supportSelectBatch" : true                           # 建议打开

PS:
maxSeqLen = maxInputTokenLen + maxIterTimes:最大序列长度 = 最大输入token数 + 最大输出token数(上文配置为 2560 = 2048 + 512)
maxPrefillTokens ≥ maxInputTokenLen:预填充最大token数不应小于最大输入token数(上文配置为 8192 ≥ 2048)

容器编排

# Write the docker-compose file for the Qwen3-32B service.
# The container uses host networking, maps host NPU devices davinci4-7 plus the
# Ascend management devices, and bind-mounts the models, scripts and config.json
# from /data/Qwen3-32B.
cd /data/Qwen3-32B
cat > docker-compose.yml << EOF
services:
  Qwen3-32B:
    image: swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.3.0-800I-A2-py311-openeuler24.03-lts
    hostname: Qwen3-32B
    container_name: Qwen3-32B
    network_mode: "host"
    shm_size: '256gb'
    entrypoint: ["/bin/bash", "/scripts/entrypoint.sh"]
    devices:
      - "/dev/davinci_manager:/dev/davinci_manager:rwm"
      - "/dev/hisi_hdc:/dev/hisi_hdc:rwm"
      - "/dev/devmm_svm:/dev/devmm_svm:rwm"
      - "/dev/davinci4:/dev/davinci4"
      - "/dev/davinci5:/dev/davinci5"
      - "/dev/davinci6:/dev/davinci6"
      - "/dev/davinci7:/dev/davinci7"
    environment:
      - TZ=Asia/Shanghai
      - MIES_SERVICE_MONITOR_MODE=1
      - ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest
    volumes:
      - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro
      - /usr/local/Ascend/firmware/:/usr/local/Ascend/firmware:ro
      - /usr/local/sbin:/usr/local/sbin:ro
      - ./models:/models                            # 模型目录
      - ./scripts:/scripts
      - ./config/config.json:/usr/local/Ascend/mindie/latest/mindie-service/conf/config.json
EOF

初始化脚本

# Write the container entrypoint script (docker-compose.yml runs /scripts/entrypoint.sh).
# The heredoc delimiter is quoted, so the script body is written out verbatim
# without any expansion by the current shell.
cd /data/Qwen3-32B
cat > scripts/entrypoint.sh << 'EOF'
#!/bin/bash

# 1. 激活环境 (确保路径正确)
echo "Initializing CANN environment..."
source /usr/local/Ascend/ascend-toolkit/set_env.sh

echo "Initializing MindIE environment..."
source /usr/local/Ascend/mindie/set_env.sh

# 2. 启动 MindIE Service
echo "Starting MindIE Service..."
cd /usr/local/Ascend/mindie/latest/mindie-service/bin
./mindieservice_daemon

# 如果 mindieservice_daemon 是个非阻塞命令,
# 为了防止脚本退出,可以加上下面这句:
# tail -f /dev/null
EOF

启动容器

# Start the service in the background from the directory holding docker-compose.yml
cd /data/Qwen3-32B
docker-compose up -d

# Follow the container logs
docker logs -f Qwen3-32B

API调用验证

# List the models served on the business port (ipAddress/port from config.json)
curl http://172.16.0.101:18025/v1/models

# Verify the generate API.
# The body is JSON, so declare it explicitly; curl -d alone sends
# application/x-www-form-urlencoded. The sampling parameters use snake_case
# names (max_tokens, top_p, ...); the underscore-less spellings are not valid keys.
# NOTE(review): confirm the accepted field set (e.g. do_sample) against the
# MindIE vLLM-compatible /generate reference for the deployed version.
curl http://172.16.0.101:18025/generate \
  -H "Content-Type: application/json" \
  -d '{
  "prompt": "你是什么模型",
  "max_tokens": 150,
  "stream": false,
  "do_sample": true,
  "repetition_penalty": 1.5,
  "temperature": 0.7,
  "top_p": 0.95,
  "top_k": 100,
  "model": "Qwen3-32B"
}'

# Verify the OpenAI-style chat completions endpoint with a single user message
curl http://172.16.0.101:18025/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen3-32B",
    "messages": [{"role": "user", "content": "你是什么模型"}],
    "temperature": 0.7
  }'

BenchMark测试

测试文件下载

#拷贝GSM8K数据集文件到models目录下
ls -1 models/
Qwen3-32B
GSM8K-in3500-bs8000-qwen3

benchmark测试

# Open an interactive shell in the Qwen3-32B container
# (the following commands are presumably meant to be run inside it)
docker exec -it Qwen3-32B /bin/bash

# Restrict the benchmark and client config files to mode 640
# NOTE(review): presumably required by the tools' permission check — confirm
chmod 640 /usr/local/lib/python3.11/site-packages/mindiebenchmark/config/config.json
chmod 640 /usr/local/lib/python3.11/site-packages/mindieclient/python/config/config.json

# Run the GSM8K benchmark against the running service.
# Every option line except the last must end with a backslash; otherwise the
# shell runs the trailing --SavePath line as a separate command and the option is lost.
benchmark \
--DatasetPath "/models/GSM8K-in3500-bs8000-qwen3" \
--DatasetType "gsm8k" \
--ModelName "Qwen3-32B" \
--ModelPath "/models/Qwen3-32B" \
--TestType vllm_client \
--Concurrency 1 \
--Tokenizer True \
--Http http://172.16.0.101:18025 \
--TestAccuracy True \
--MaxOutputLen 1 \
--SavePath ./benchmark_results01

显卡常用命令

# Show installation information
npu-smi info

# Check whether the davinci kernel modules are loaded
lsmod | grep davinci

# Check the firmware version (board info for device 0)
npu-smi info -t board -i 0

# Check which cards are currently in use
cat /proc/uda/namespace_node
Logo

欢迎加入DeepSeek 技术社区。在这里,你可以找到志同道合的朋友,共同探索AI技术的奥秘。

更多推荐