Create a minikube environment with support for a "shared" GPU (time slicing)
#!/bin/bash
# Configure the HTTP proxy (critical)
source ~/proxy.sh
# Clean up the old cluster and start a new one
minikube delete
minikube start \
  --driver docker \
  --container-runtime docker \
  --gpus all \
  --memory no-limit \
  --cpus no-limit \
  --force \
  --mount \
  --mount-string /root/.cache:/root/.cache
# Make sure the minikube environment came up
minikube status || { echo "Minikube failed to start"; exit 1; }
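# Optional sanity check (sketch, not part of the original flow): confirm the GPU is
# visible inside the minikube node. This assumes the host already has the NVIDIA driver
# and nvidia-container-toolkit configured for Docker; if nvidia-smi fails here, the
# GPU Operator steps below will not be able to expose the GPU either.
minikube ssh -- nvidia-smi || echo "⚠️ nvidia-smi is not available inside the minikube node"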
# Delete the legacy GPU device plugin installed by minikube (the GPU Operator ships its own)
kubectl -n kube-system delete daemonset nvidia-device-plugin-daemonset
# Create a privileged namespace for the GPU Operator
kubectl create ns gpu-operator
kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged
# Add and update the NVIDIA Helm repository
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update
# Install the GPU Operator
helm upgrade --install --wait gpu-operator \
  -n gpu-operator \
  nvidia/gpu-operator
# Configure GPU time slicing (expose each physical GPU as 10 schedulable replicas)
kubectl apply -n gpu-operator -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
data:
  any: |-
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        resources:
          - name: nvidia.com/gpu
            replicas: 10
EOF
# Apply the time-slicing policy to the device plugin
kubectl patch clusterpolicies.nvidia.com/cluster-policy \
  -n gpu-operator \
  --type merge \
  -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
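# Quick spot check (sketch): once the device plugin restarts with the new config, the
# minikube node should advertise 10 allocatable nvidia.com/gpu replicas for the single
# physical GPU. This can take a minute or two; the polling loop below waits for it properly.
kubectl describe node minikube | grep -A 10 "Allocatable:" | grep "nvidia.com/gpu" || true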
# Check GPU allocation status
# Polling parameters
timeout=300   # maximum wait time (5 minutes)
interval=10   # polling interval (seconds)
start_time=$(date +%s)
all_nodes_ready=false
echo "⏳ Monitoring GPU resource allocation status..."
# Loop until every node reports allocatable GPU > 0
until $all_nodes_ready; do
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))
  # Timeout check
  if [ $elapsed -ge $timeout ]; then
    echo -e "\n⌛️ Timed out after ${timeout}s! Some nodes may not have GPUs ready"
    break
  fi
  all_nodes_ready=true
  nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')
  # Live status display
  echo -e "\n===== Check #$((elapsed/interval+1)) (waited ${elapsed}s) ====="
  for node in $nodes; do
    allocatable_gpu=$(kubectl describe node "$node" 2>/dev/null | awk '/Allocatable:/ {trigger=1} trigger && /nvidia.com\/gpu:/ {print $2; exit}')
    # Validate that the value is a positive integer
    if [[ -n "$allocatable_gpu" && "$allocatable_gpu" =~ ^[0-9]+$ ]]; then
      if [ "$allocatable_gpu" -gt 0 ]; then
        echo "✅ Node $node: ${allocatable_gpu} GPUs allocatable"
      else
        echo "⏳ Node $node: waiting for GPUs to activate (current: 0)"
        all_nodes_ready=false
      fi
    else
      echo "❓ Node $node: unexpected value (current: '${allocatable_gpu:-not detected}')"
      all_nodes_ready=false
    fi
  done
  # Show a wait hint when not ready yet
  if ! $all_nodes_ready; then
    remaining=$((timeout - elapsed))
    echo -e "\n⏱️ Rechecking in ${interval}s (timeout countdown: ${remaining}s)"
    sleep $interval
  fi
done
# Final status report
if $all_nodes_ready; then
  echo -e "\n🎉 GPU resources are ready on all nodes! Current allocation:"
  kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: {.status.allocatable}{"\n"}{end}' | grep nvidia.com/gpu
else
  echo -e "\n❗️ Some nodes are not ready. To troubleshoot:"
  echo "1. Check operator status: kubectl get pods -n gpu-operator"
  echo "2. Inspect node details: kubectl describe node minikube | grep -A 15 Allocatable"
  echo "3. Check device plugin logs: kubectl logs -n gpu-operator -l app=nvidia-device-plugin-daemonset"
fi
# Note: running a model on a (shared) single GPU still requires tuning the serving parameters.
# 1. Pull the Docker image and save it as a tar file
echo "🚀 Pulling and saving the Docker image..."
docker pull lmsysorg/sglang:v0.4.9.post3-cu126
mkdir -p /tmp/lmsysorg
if [ ! -f "/tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar" ]; then
  echo "💾 Saving the image as a tar file..."
  docker save lmsysorg/sglang:v0.4.9.post3-cu126 -o /tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar
else
  echo "✅ Image tar file already exists, skipping save"
fi
echo "🔍 检查Minikube镜像"
if minikube image ls | grep -q "lmsysorg/sglang:v0.4.9.post3-cu126"; then
echo "✅ 镜像 lmsysorg/sglang:v0.4.9.post3-cu126 已存在,跳过加载"
else
echo "⬆️ 加载镜像到Minikube..."
minikube image load /tmp/lmsysorg/sglang:v0.4.9.post3-cu126.tar --remote=false --daemon=false
fi
# 3. Apply the Kubernetes manifests
echo "⚙️ Deploying Kubernetes resources..."
cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qwen25-05b-instruct-sglang
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: qwen25-05b-instruct-sglang
  template:
    metadata:
      labels:
        app: qwen25-05b-instruct-sglang
        model: qwen25-05b-instruct
        engine: sglang
    spec:
      restartPolicy: Always
      runtimeClassName: nvidia
      containers:
        - name: qwen25-05b-instruct-sglang
          image: docker.io/lmsysorg/sglang:v0.4.9.post3-cu126
          # IfNotPresent so the image pre-loaded into Minikube above is actually used
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args: ["--model-path", "Qwen/Qwen2.5-0.5B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
          env:
            - name: SGLANG_USE_MODELSCOPE
              value: "true"
            - name: HF_ENDPOINT
              value: "https://hf-mirror.com"
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /root/.cache/modelscope
              readOnly: true
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 5Gi
        - name: hf-cache
          hostPath:
            path: /root/.cache/modelscope
            type: Directory
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
apiVersion: v1
kind: Service
metadata:
  name: qwen25-05b-instruct-sglang
spec:
  selector:
    app: qwen25-05b-instruct-sglang
  ports:
    - protocol: TCP
      port: 30000
      targetPort: 30000
EOF
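# Optional (sketch): wait for the Deployment to become ready before port-forwarding.
# The first start can take several minutes while the model weights are downloaded;
# the 600s timeout below is an assumption, adjust it to your network.
kubectl rollout status deployment/qwen25-05b-instruct-sglang --timeout=600s || \
  echo "⚠️ Deployment not ready yet; the test request below may fail"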
# Start port forwarding (parameters hard-coded)
echo "🔓 Exposing the service port (30000 → qwen25-05b-instruct-sglang:30000)..."
kubectl port-forward service/qwen25-05b-instruct-sglang 30000:30000 \
  --address 0.0.0.0 --namespace default > /dev/null 2>&1 &
FORWARD_PID=$!
# Wait for the forward to initialize (2 seconds is usually enough)
sleep 2
# Send a test request (URL hard-coded)
echo -e "\n🧪 Sending a test request to the model service (http://127.0.0.1:30000)"
curl -s http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-0.5B-Instruct",
    "messages": [{"role": "user", "content": "What is the capital of France?"}]
  }'
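# Optional follow-up check (sketch): sglang's OpenAI-compatible server also exposes a
# model listing endpoint; querying it is another quick way to confirm the service is up.
# The /v1/models path is assumed from the OpenAI-compatible API, like the call above.
echo -e "\n🧪 Listing models served by the endpoint"
curl -s http://127.0.0.1:30000/v1/models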
# Clean up automatically
echo -e "\n🛑 Stopping port forwarding (PID: $FORWARD_PID)"
kill $FORWARD_PID