Skip to content

2단계. 리소스 배포

Deployment, Service, VirtualService 순서로 배포합니다. 리소스별로 YAML 파일을 작성한 후 `kubectl apply -f <파일명>` 명령으로 적용합니다.


1. Deployment

qwen3-vl-235b-a22b-instruct 예시

# Deployment running one vLLM OpenAI-compatible server pod for
# Qwen/Qwen3-VL-235B-A22B-Instruct in the llm namespace, using 8 GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
  name: onprem-llm-qwen3-vl-235b-a22b-instruct
  namespace: llm
spec:
  replicas: 1
  selector:
    # Must stay identical to the pod template labels below.
    matchLabels:
      app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
        app.kubernetes.io/name: vllm
    spec:
      containers:
      - args:
        # Folded scalar (>-): the lines are joined with single spaces into the
        # one command string that `/bin/sh -c` executes.
        # --tensor-parallel-size 8 matches the nvidia.com/gpu count below.
        # NOTE(review): --max-num-batched-tokens 128 is far below
        # --max-model-len 41960; confirm this is intentional.
        - >-
          vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct --tensor-parallel-size 8
          --mm-encoder-tp-mode data --enable-expert-parallel --dtype bfloat16
          --gpu-memory-utilization 0.90 --max-model-len 41960
          --max-num-batched-tokens 128 --distributed-executor-backend mp
        command:
        - /bin/sh
        - -c
        env:
        - name: HF_MODEL_ID
          value: Qwen/Qwen3-VL-235B-A22B-Instruct
        # Offline switches: presumably the model files are pre-populated in
        # the cache volume mounted at /root/.cache -- verify before deploying.
        - name: HF_HUB_OFFLINE
          value: "1"
        - name: TRANSFORMERS_OFFLINE
          value: "1"
        - name: HF_DATASETS_OFFLINE
          value: "1"
        # Both cache paths live under the cache-volume mount (/root/.cache).
        - name: HF_HOME
          value: /root/.cache/huggingface
        - name: TIKTOKEN_ENCODINGS_BASE
          value: /root/.cache/encodings
        image: harbor.add.re.kr/library/vllm-openai:v0.12.0
        imagePullPolicy: IfNotPresent
        name: vllm-server
        ports:
        - containerPort: 8000
          protocol: TCP
        # Keep the pod out of the Service endpoints until the API server
        # answers on /health, so requests are not routed to it while the
        # model is still loading. A readiness probe never restarts the pod.
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        resources:
          limits:
            cpu: "32"
            memory: 256Gi
            nvidia.com/gpu: "8"
          requests:
            cpu: "8"
            memory: 8Gi
            nvidia.com/gpu: "8"
        volumeMounts:
        - mountPath: /root/.cache
          name: cache-volume
        - mountPath: /dev/shm
          name: shm-volume
      restartPolicy: Always
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: vllm-cache
      # RAM-backed /dev/shm for the vLLM worker processes.
      # NOTE(review): 2Gi for 8-way tensor parallelism -- confirm it is enough.
      - emptyDir:
          medium: Memory
          sizeLimit: 2Gi
        name: shm-volume

z-image-turbo 예시

# Deployment running one vllm-omni server pod for Tongyi-MAI/Z-Image-Turbo
# in the llm namespace, using 1 GPU and a local model snapshot path.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
  name: onprem-llm-z-image-turbo
  namespace: llm
spec:
  replicas: 1
  selector:
    # Must stay identical to the pod template labels below.
    matchLabels:
      app.kubernetes.io/instance: onprem-llm-z-image-turbo
      app.kubernetes.io/name: vllm
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: onprem-llm-z-image-turbo
        app.kubernetes.io/name: vllm
    spec:
      containers:
      - args:
        # Folded scalar (>-) so both lines reach `/bin/sh -c` as ONE command.
        # A literal block (|) keeps the line break, which ends the command
        # after the model path; the flags on the second line would then be
        # run as a separate command and never be passed to `vllm serve`.
        - >-
          vllm serve /root/.cache/huggingface/hub/models--Tongyi-MAI--Z-Image-Turbo/snapshots/0e36c2b379e66fa531d01cc531c44919e5f1c6fd
          --tensor-parallel-size 1 --omni --port 8000
        command:
        - /bin/sh
        - -c
        env:
        - name: HF_MODEL_ID
          value: Tongyi-MAI/Z-Image-Turbo
        # Offline switches: the model is served from the snapshot directory
        # inside the cache volume mounted at /root/.cache.
        - name: HF_HUB_OFFLINE
          value: "1"
        - name: TRANSFORMERS_OFFLINE
          value: "1"
        - name: HF_DATASETS_OFFLINE
          value: "1"
        # Both cache paths live under the cache-volume mount (/root/.cache).
        - name: HF_HOME
          value: /root/.cache/huggingface
        - name: TIKTOKEN_ENCODINGS_BASE
          value: /root/.cache/encodings
        image: harbor.add.re.kr/library/vllm-omni:v0.12.0rc1
        imagePullPolicy: IfNotPresent
        name: vllm-server
        ports:
        # Same port as the --port flag in args.
        - containerPort: 8000
          protocol: TCP
        resources:
          limits:
            cpu: "4"
            memory: 8Gi
            nvidia.com/gpu: "1"
          requests:
            cpu: "4"
            memory: 4Gi
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /root/.cache
          name: cache-volume
      restartPolicy: Always
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: vllm-cache

위 매니페스트를 deployment.yaml 파일로 저장한 후 적용합니다. 두 예시를 하나의 파일에 함께 넣는 경우 매니페스트 사이를 `---` 줄로 구분합니다.

kubectl apply -f deployment.yaml

2. Service

qwen3-vl-235b-a22b-instruct 예시

# ClusterIP Service exposing the qwen3-vl-235b-a22b-instruct vLLM pods
# on TCP port 8000 inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: onprem-llm-qwen3-vl-235b-a22b-instruct
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
spec:
  type: ClusterIP
  # Selects pods carrying both labels (the Deployment's pod template labels).
  selector:
    app.kubernetes.io/name: vllm
    app.kubernetes.io/instance: onprem-llm-qwen3-vl-235b-a22b-instruct
  ports:
  # Service port 8000 forwards to container port 8000.
  - name: http
    protocol: TCP
    port: 8000
    targetPort: 8000

z-image-turbo 예시

# ClusterIP Service exposing the z-image-turbo vLLM pods on TCP port 8000
# inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: onprem-llm-z-image-turbo
  namespace: llm
  labels:
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
spec:
  type: ClusterIP
  # Selects pods carrying both labels (the Deployment's pod template labels).
  selector:
    app.kubernetes.io/name: vllm
    app.kubernetes.io/instance: onprem-llm-z-image-turbo
  ports:
  # Service port 8000 forwards to container port 8000.
  - name: http
    protocol: TCP
    port: 8000
    targetPort: 8000

위 매니페스트를 service.yaml 파일로 저장한 후 적용합니다. 두 예시를 하나의 파일에 함께 넣는 경우 매니페스트 사이를 `---` 줄로 구분합니다.

kubectl apply -f service.yaml

3. VirtualService

qwen3-vl-235b-a22b-instruct 예시

# Istio VirtualService: requests for the host below that arrive through the
# runway ingress gateway are routed to the qwen3-vl Service on port 8000.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  namespace: llm
  name: qwen3-vl-235b-a22b-instruct
spec:
  hosts:
  - qwen3-vl-235b-a22b-instruct.runway.add.re.kr
  # Gateway is referenced as <namespace>/<name>.
  gateways:
  - istio-system/runway-ingress-gateway
  http:
  # Single rule: prefix "/" matches every request path.
  - match:
    - uri:
        prefix: /
    route:
    - destination:
        # Short Service name; the Service is in this same namespace (llm).
        host: onprem-llm-qwen3-vl-235b-a22b-instruct
        port:
          number: 8000

z-image-turbo 예시

# Istio VirtualService: requests for the host below that arrive through the
# runway ingress gateway are routed to the z-image-turbo Service on port 8000.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  namespace: llm
  name: z-image-turbo
spec:
  hosts:
  - z-image-turbo.runway.add.re.kr
  # Gateway is referenced as <namespace>/<name>.
  gateways:
  - istio-system/runway-ingress-gateway
  http:
  # Single rule: prefix "/" matches every request path.
  - match:
    - uri:
        prefix: /
    route:
    - destination:
        # Short Service name; the Service is in this same namespace (llm).
        host: onprem-llm-z-image-turbo
        port:
          number: 8000

위 매니페스트를 virtualservice.yaml 파일로 저장한 후 적용합니다. 두 예시를 하나의 파일에 함께 넣는 경우 매니페스트 사이를 `---` 줄로 구분합니다.

kubectl apply -f virtualservice.yaml