2단계. 리소스 배포
Deployment, Service, VirtualService 순서로 배포합니다. 리소스별로 YAML 파일을 생성한 후 kubectl apply 명령으로 적용합니다.
1. Deployment
qwen3-vl-235b-a22b-instruct 예시
---
# Deployment that runs `vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct` as a
# single replica in the `llm` namespace. The container requests 8 GPUs and
# exposes container port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: onprem-llm-qwen3-vl-235b-a22b-instruct
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
spec:
  replicas: 1
  selector:
    # Must stay identical to the pod template labels below.
    matchLabels:
      app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
        app.kubernetes.io/name: vllm
    spec:
      restartPolicy: Always
      containers:
        - name: vllm-server
          image: harbor.add.re.kr/library/vllm-openai:v0.12.0
          imagePullPolicy: IfNotPresent
          command:
            - /bin/sh
            - -c
          args:
            # Folded scalar (>-): the lines below are joined with single
            # spaces, so /bin/sh -c receives exactly one command line.
            - >-
              vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct
              --tensor-parallel-size 8
              --mm-encoder-tp-mode data
              --enable-expert-parallel
              --dtype bfloat16
              --gpu-memory-utilization 0.90
              --max-model-len 41960
              --max-num-batched-tokens 128
              --distributed-executor-backend mp
          env:
            - name: HF_MODEL_ID
              value: Qwen/Qwen3-VL-235B-A22B-Instruct
            # The three *_OFFLINE switches are set to "1"; the model files are
            # presumably pre-populated under HF_HOME on the cache volume
            # (verify before the first rollout).
            - name: HF_HUB_OFFLINE
              value: "1"
            - name: TRANSFORMERS_OFFLINE
              value: "1"
            - name: HF_DATASETS_OFFLINE
              value: "1"
            - name: HF_HOME
              value: /root/.cache/huggingface
            - name: TIKTOKEN_ENCODINGS_BASE
              value: /root/.cache/encodings
          ports:
            - containerPort: 8000
              protocol: TCP
          # Keep the pod out of the Service endpoints until the server answers
          # on /health. A readiness probe never restarts the container, so a
          # long model load only delays readiness.
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            failureThreshold: 3
          resources:
            limits:
              cpu: "32"
              memory: 256Gi
              nvidia.com/gpu: "8"
            requests:
              cpu: "8"
              memory: 8Gi
              nvidia.com/gpu: "8"
          volumeMounts:
            - name: cache-volume
              mountPath: /root/.cache
            - name: shm-volume
              mountPath: /dev/shm
      volumes:
        # PVC mounted at /root/.cache; HF_HOME and TIKTOKEN_ENCODINGS_BASE
        # both point inside it.
        - name: cache-volume
          persistentVolumeClaim:
            claimName: vllm-cache
        # Memory-backed emptyDir mounted at /dev/shm.
        - name: shm-volume
          emptyDir:
            medium: Memory
            sizeLimit: 2Gi
z-image-turbo 예시
---
# Deployment that runs `vllm serve ... --omni` for the Z-Image-Turbo snapshot
# stored on the cache volume. Single replica in the `llm` namespace, 1 GPU,
# container port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: onprem-llm-z-image-turbo
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
spec:
  replicas: 1
  selector:
    # Must stay identical to the pod template labels below.
    matchLabels:
      app.kubernetes.io/instance: onprem-llm-z-image-turbo
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: onprem-llm-z-image-turbo
        app.kubernetes.io/name: vllm
    spec:
      restartPolicy: Always
      containers:
        - name: vllm-server
          image: harbor.add.re.kr/library/vllm-omni:v0.12.0rc1
          imagePullPolicy: IfNotPresent
          command:
            - /bin/sh
            - -c
          args:
            # Use a folded scalar (>-), not a literal block (|). A literal
            # block keeps the line break, so /bin/sh -c would run
            # `vllm serve <path>` alone and the flags on the following line
            # would never reach it. Folding joins the lines with a space and
            # strips the trailing newline.
            - >-
              vllm serve /root/.cache/huggingface/hub/models--Tongyi-MAI--Z-Image-Turbo/snapshots/0e36c2b379e66fa531d01cc531c44919e5f1c6fd
              --tensor-parallel-size 1 --omni --port 8000
          env:
            - name: HF_MODEL_ID
              value: Tongyi-MAI/Z-Image-Turbo
            # The three *_OFFLINE switches are set to "1"; the server is
            # pointed at a local snapshot path on the cache volume.
            - name: HF_HUB_OFFLINE
              value: "1"
            - name: TRANSFORMERS_OFFLINE
              value: "1"
            - name: HF_DATASETS_OFFLINE
              value: "1"
            - name: HF_HOME
              value: /root/.cache/huggingface
            - name: TIKTOKEN_ENCODINGS_BASE
              value: /root/.cache/encodings
          ports:
            # Matches the --port 8000 flag passed to `vllm serve`.
            - containerPort: 8000
              protocol: TCP
          resources:
            limits:
              cpu: "4"
              memory: 8Gi
              nvidia.com/gpu: "1"
            requests:
              cpu: "4"
              memory: 4Gi
              nvidia.com/gpu: "1"
          volumeMounts:
            - name: cache-volume
              mountPath: /root/.cache
      volumes:
        # PVC mounted at /root/.cache; it holds the snapshot directory that
        # the serve command references.
        - name: cache-volume
          persistentVolumeClaim:
            claimName: vllm-cache
kubectl apply -f deployment.yaml
2. Service
qwen3-vl-235b-a22b-instruct 예시
# ClusterIP Service in the `llm` namespace. Forwards TCP port 8000 to port
# 8000 of the pods matched by the selector below.
apiVersion: v1
kind: Service
metadata:
  name: onprem-llm-qwen3-vl-235b-a22b-instruct
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
spec:
  type: ClusterIP
  # Both labels must be present on a pod for it to receive traffic.
  selector:
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
    app.kubernetes.io/name: vllm
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: 8000
z-image-turbo 예시
# ClusterIP Service in the `llm` namespace. Forwards TCP port 8000 to port
# 8000 of the pods matched by the selector below.
apiVersion: v1
kind: Service
metadata:
  name: onprem-llm-z-image-turbo
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
spec:
  type: ClusterIP
  # Both labels must be present on a pod for it to receive traffic.
  selector:
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
    app.kubernetes.io/name: vllm
  ports:
    - name: http
      protocol: TCP
      port: 8000
      targetPort: 8000
kubectl apply -f service.yaml
3. VirtualService
qwen3-vl-235b-a22b-instruct 예시
# Istio VirtualService bound to the istio-system/runway-ingress-gateway
# gateway. Every request for the listed host (URI prefix "/") is routed to
# port 8000 of the destination host below.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: qwen3-vl-235b-a22b-instruct
  namespace: llm
spec:
  hosts:
    - qwen3-vl-235b-a22b-instruct.runway.add.re.kr
  gateways:
    - istio-system/runway-ingress-gateway
  http:
    - match:
        - uri:
            prefix: /
      route:
        - destination:
            # Short name; expected to be the Service of the same name in this
            # VirtualService's namespace (llm) - confirm it exists there.
            host: onprem-llm-qwen3-vl-235b-a22b-instruct
            port:
              number: 8000
z-image-turbo 예시
# Istio VirtualService bound to the istio-system/runway-ingress-gateway
# gateway. Every request for the listed host (URI prefix "/") is routed to
# port 8000 of the destination host below.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: z-image-turbo
  namespace: llm
spec:
  hosts:
    - z-image-turbo.runway.add.re.kr
  gateways:
    - istio-system/runway-ingress-gateway
  http:
    - match:
        - uri:
            prefix: /
      route:
        - destination:
            # Short name; expected to be the Service of the same name in this
            # VirtualService's namespace (llm) - confirm it exists there.
            host: onprem-llm-z-image-turbo
            port:
              number: 8000
kubectl apply -f virtualservice.yaml