Create a minikube environment with support for a "shared" GPU
#!/bin/bash
# Configure the HTTP proxy (critical if the registries are not directly reachable)
source ~/proxy.sh

# Tear down any old cluster and start a fresh one
minikube delete
minikube start \
  --driver docker \
  --container-runtime docker \
  --gpus all \
  --memory no-limit \
  --cpus no-limit \
  --force \
  --mount \
  --mount-string /root/.cache:/root/.cache

# Make sure the minikube environment initialized successfully
minikube status || { echo "minikube failed to start"; exit 1; }
# Remove the old GPU device plugin (the GPU Operator will manage its own)
kubectl -n kube-system delete daemonset nvidia-device-plugin-daemonset --ignore-not-found
# Create a privileged namespace for the operator
kubectl create ns gpu-operator
kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged
# Add and refresh the NVIDIA Helm repository
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
# Install the GPU Operator
helm upgrade --install --wait gpu-operator \
  -n gpu-operator \
  nvidia/gpu-operator
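
# Optional sanity check (a sketch, not part of the original flow; some operator
# pods are one-shot validators that finish as Succeeded rather than Ready,
# hence the "|| true"): wait for the operand pods that the operator spawns.
kubectl wait --for=condition=Ready pods --all -n gpu-operator --timeout=600s || true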
# Configure GPU time-slicing: one physical GPU is advertised as 10 schedulable GPUs
kubectl apply -n gpu-operator -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
data:
  any: |-
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        resources:
          - name: nvidia.com/gpu
            replicas: 10
EOF
# Apply the time-slicing policy to the device plugin
kubectl patch clusterpolicies.nvidia.com/cluster-policy \
  -n gpu-operator \
  --type merge \
  -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
# Check how GPUs are being allocated
# Polling parameters
timeout=300   # maximum wait time (5 minutes)
interval=10   # polling interval (seconds)
start_time=$(date +%s)
all_nodes_ready=false

echo "⏳ Monitoring GPU resource allocation status..."
# Loop until every node reports allocatable GPUs > 0
until $all_nodes_ready; do
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))

  # Time out after $timeout seconds
  if [ "$elapsed" -ge "$timeout" ]; then
    echo -e "\n⌛️ Timed out after ${timeout}s! Some nodes' GPUs may not be ready"
    break
  fi

  all_nodes_ready=true
  nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')

  # Live status output
  echo -e "\n===== Check round $((elapsed/interval+1)) (waited ${elapsed}s) ====="
  for node in $nodes; do
    allocatable_gpu=$(kubectl describe node "$node" 2>/dev/null | awk '/Allocatable:/ {trigger=1} trigger && /nvidia.com\/gpu:/ {print $2; exit}')
    # Validate that the value is a positive integer
    if [[ -n "$allocatable_gpu" && "$allocatable_gpu" =~ ^[0-9]+$ ]]; then
      if [ "$allocatable_gpu" -gt 0 ]; then
        echo "✅ Node $node: ${allocatable_gpu} GPU(s) allocatable"
      else
        echo "⏳ Node $node: waiting for GPUs to activate (current: 0)"
        all_nodes_ready=false
      fi
    else
      echo "❓ Node $node: unexpected value (current: '${allocatable_gpu:-not detected}')"
      all_nodes_ready=false
    fi
  done

  # If not ready, wait before the next round
  if ! $all_nodes_ready; then
    remaining=$((timeout - elapsed))
    echo -e "\n⏱️ Re-checking in ${interval}s (timeout countdown: ${remaining}s)"
    sleep "$interval"
  fi
done
# Final status report
if $all_nodes_ready; then
  echo -e "\n🎉 GPU resources are ready on all nodes! Current allocation:"
  kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable}{"\n"}{end}' | grep nvidia.com/gpu
else
  echo -e "\n❗️ Some nodes are not ready. To diagnose:"
  echo "1. Check operator status:  kubectl get pods -n gpu-operator"
  echo "2. Inspect node details:   kubectl describe node minikube | grep -A 15 Allocatable"
  echo "3. Check the plugin logs:  kubectl logs -n gpu-operator -l app=nvidia-device-plugin-daemonset"
fi
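
# Optional smoke test (a sketch; the CUDA base image and tag are assumptions,
# not part of the original setup): schedule one pod against a time-sliced GPU,
# print its nvidia-smi output, then clean up.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: cuda
      image: nvidia/cuda:12.4.1-base-ubuntu22.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/gpu-smoke-test --timeout=300s
kubectl logs gpu-smoke-test
kubectl delete pod gpu-smoke-test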
#!/bin/bash
# Note: running a model on a single (time-sliced) GPU still requires tuning the launch parameters.
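# For example (a sketch; the values below are starting points to adapt, not
# tested defaults): when several replicas share one card via time-slicing,
# shrink each server's slice of GPU memory and cap its concurrency with
# sglang's own flags, e.g.
#   --mem-fraction-static 0.3     # fraction of GPU memory for weights + KV cache
#   --max-running-requests 8      # concurrent requests per replica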
# 1. Pull the Docker image and save it to a tar file
echo "🚀 Pulling and saving the Docker image..."
docker pull lmsysorg/sglang:v0.4.9.post3-cu126
mkdir -p /tmp/lmsysorg
if [ ! -f "/tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar" ]; then
  echo "💾 Saving the image as a tar file..."
  docker save lmsysorg/sglang:v0.4.9.post3-cu126 -o /tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar
else
  echo "✅ Image tar file already exists; skipping the save step"
fi
echo "🔍 检查Minikube镜像" | |
if minikube image ls | grep -q "lmsysorg/sglang:v0.4.9.post3-cu126"; then | |
echo "✅ 镜像 lmsysorg/sglang:v0.4.9.post3-cu126 已存在,跳过加载" | |
else | |
echo "⬆️ 加载镜像到Minikube..." | |
minikube image load /tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar --remote=false --daemon=false | |
fi | |
# 3. Apply the Kubernetes manifests
echo "⚙️ Deploying Kubernetes resources..."
cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qwen25-05b-instruct-sglang
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: qwen25-05b-instruct-sglang
  template:
    metadata:
      labels:
        app: qwen25-05b-instruct-sglang
        model: qwen25-05b-instruct
        engine: sglang
    spec:
      restartPolicy: Always
      runtimeClassName: nvidia
      containers:
        - name: qwen25-05b-instruct-sglang
          image: docker.io/lmsysorg/sglang:v0.4.9.post3-cu126
          # Use the image loaded into minikube above instead of re-pulling from the registry
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args: ["--model-path", "Qwen/Qwen2.5-0.5B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
          env:
            - name: SGLANG_USE_MODELSCOPE
              value: "true"
            - name: HF_ENDPOINT
              value: "https://hf-mirror.com"
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /root/.cache/modelscope
              readOnly: true
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 5Gi
        - name: hf-cache
          hostPath:
            path: /root/.cache/modelscope
            type: Directory
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
apiVersion: v1
kind: Service
metadata:
  name: qwen25-05b-instruct-sglang
spec:
  selector:
    app: qwen25-05b-instruct-sglang
  ports:
    - protocol: TCP
      port: 30000
      targetPort: 30000
EOF
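
# Wait for the rollout before forwarding traffic (a sketch; the 600s budget is
# an assumption sized to cover the first model download).
kubectl rollout status deployment/qwen25-05b-instruct-sglang --timeout=600s || {
  echo "❌ Deployment did not become ready"; exit 1;
}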
# Start port forwarding (parameters hard-coded)
echo "🔓 Exposing the service port (30000 → qwen25-05b-instruct-sglang:30000)..."
kubectl port-forward service/qwen25-05b-instruct-sglang 30000:30000 \
  --address 0.0.0.0 --namespace default > /dev/null 2>&1 &
FORWARD_PID=$!

# Give the forward a moment to initialize (2 seconds is usually enough)
sleep 2

# Send a test request (URL hard-coded)
echo -e "\n🧪 Sending a test request to the model service (http://127.0.0.1:30000)"
curl -s http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "messages": [{"role": "user", "content": "What is the capital of France?"}]
  }'

# Automatic cleanup
echo -e "\n🛑 Stopping port forwarding (PID: $FORWARD_PID)"
kill $FORWARD_PID
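
# To actually exercise the "shared" GPU (a sketch; 4 replicas is an arbitrary
# choice): with time-slicing advertising 10 GPUs on the single card, several
# replicas each requesting nvidia.com/gpu: 1 can be scheduled side by side,
# provided the launch parameters leave each server enough GPU memory.
kubectl scale deployment qwen25-05b-instruct-sglang --replicas=4
kubectl get pods -l app=qwen25-05b-instruct-sglang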