
【DeepSeek-R1】Streaming and Non-Streaming Inference with DeepSeek-R1-Distill-Qwen-1.5B
Deploying DeepSeek-R1-Distill-Qwen-1.5B locally on Windows 11. To suit different usage needs, the script below supports two generation modes: streaming and non-streaming.
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread
class DeepSeekModel:
    def __init__(self, model_path="D:\\Algorithm\\DeepSeek-R1-Distill-Qwen-1.5B\\DeepSeek-R1-Distill-Qwen-1.5B"):
        # Pick the device adaptively based on GPU availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        # Load the tokenizer and model with stability-oriented options
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",  # let accelerate place the weights; calling .to() afterwards conflicts with this, so it is omitted
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
            low_cpu_mem_usage=True,      # reduce CPU memory usage while loading
            attn_implementation="eager"  # avoid potentially unstable attention optimizations
        )
        # Fall back to the EOS token for padding if the tokenizer defines no pad token
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        # Evaluation mode: no gradient bookkeeping is needed for inference
        self.model.eval()
    def generate_response(self, prompt, max_length=2048, temperature=0.7, stream=False):
        # Dispatch to separate helpers: a function body containing `yield` is
        # always a generator, so a single body with both `yield` and `return`
        # would hand back a generator object even when stream=False.
        if stream:
            return self._stream_response(prompt, max_length, temperature)
        return self._full_response(prompt, max_length, temperature)

    def _stream_response(self, prompt, max_length, temperature):
        # Streaming generation: generate() runs in a background thread while
        # the streamer yields decoded text pieces as they become available
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_length,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id
        )
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        for new_text in streamer:
            yield new_text
        thread.join()

    def _full_response(self, prompt, max_length, temperature):
        try:
            # Non-streaming generation: decode the whole output in one pass
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,  # cap generated tokens, matching the streaming branch
                    do_sample=True,             # required for temperature/top_p to take effect
                    temperature=temperature,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"Error while generating a response: {str(e)}")
            return None
def main():
    # Initialize the model
    model = DeepSeekModel()
    # Interactive test loop
    while True:
        user_input = input("\nEnter your question (type 'quit' to exit): ")
        if user_input.lower() == 'quit':
            break
        stream_choice = input("Use streaming output? (y/n): ")
        if stream_choice.lower() == 'y':
            print("\nDeepSeek: ", end="")
            for piece in model.generate_response(user_input, stream=True):
                print(piece, end="", flush=True)
            print()
        else:
            response = model.generate_response(user_input)
            if response:
                print("\nDeepSeek:", response)

if __name__ == "__main__":
    main()
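One caveat: the script above feeds the raw user input straight into the tokenizer, while DeepSeek-R1 distill checkpoints are chat-tuned and usually respond better when the input is wrapped in the tokenizer's bundled chat template. Below is a minimal sketch, assuming the DeepSeekModel class above; build_chat_prompt is a hypothetical helper, not part of the original script, and apply_chat_template is a standard transformers tokenizer method.

# Hypothetical helper (not in the original script): wrap a user message in the
# model's chat template before calling generate_response. Whether the template
# already inserts DeepSeek-R1's <think> prefix depends on the checkpoint's
# tokenizer_config.json, so inspect the returned string once before relying on it.
def build_chat_prompt(model, user_message):
    messages = [{"role": "user", "content": user_message}]
    return model.tokenizer.apply_chat_template(
        messages,
        tokenize=False,              # return the formatted prompt string, not token ids
        add_generation_prompt=True,  # append the assistant-turn marker
    )

# Usage with the class above:
# prompt = build_chat_prompt(model, "What is 17 * 24?")
# for piece in model.generate_response(prompt, stream=True):
#     print(piece, end="", flush=True)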