# AWS EKS GPU cluster configuration file.
# Deploys the GPU cluster with the eksctl tool.
---
# eksctl ClusterConfig header: schema version, resource kind, and the
# identity (name, region, Kubernetes version) of the cluster to create.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  # Quoted so the Kubernetes version stays a string and is never parsed
  # as a float.
  version: "1.28"
  name: gpu-cluster
  region: us-west-2

# Self-managed GPU worker node group (NVIDIA V100 via p3.2xlarge).
nodeGroups:
  - name: gpu-workers
    instanceType: p3.2xlarge
    # Auto Scaling group bounds: starts at 2 nodes, may scale between 1 and 5.
    desiredCapacity: 2
    minSize: 1
    maxSize: 5
    # Root EBS volume: 100 GiB gp3.
    volumeSize: 100
    volumeType: gp3
    # Node labels that GPU workloads can target with a nodeSelector.
    labels:
      node-type: gpu
      accelerator: nvidia-tesla-v100
    # Keeps pods that do not tolerate nvidia.com/gpu off these nodes.
    taints:
      - key: nvidia.com/gpu
        value: "true"
        effect: NoSchedule
    # Extra IAM policies attached to the node instance role.
    iam:
      withAddonPolicies:
        autoScaler: true
        cloudWatch: true
        ebs: true
        efs: true
        fsx: true
    # SSH access using an existing EC2 key pair, referenced by name.
    ssh:
      allow: true
      publicKeyName: my-gpu-cluster-key
    # No preBootstrapCommands / overrideBootstrapCommand on purpose:
    # - For GPU instance types eksctl selects the EKS-optimized accelerated
    #   AMI, which already ships the NVIDIA driver and container runtime,
    #   so installing nvidia-docker2 is unnecessary.
    # - EKS AMIs for Kubernetes 1.24+ use containerd and have no Docker
    #   service, so "systemctl restart docker" fails during boot.
    # - A hand-written "/etc/eks/bootstrap.sh <cluster>" call skips the
    #   labels and taints declared above; eksctl's generated bootstrap
    #   passes them to the kubelet.
    # - ${AWS::StackName} / ${AWS::Region} are CloudFormation substitutions,
    #   not shell variables, so the former cfn-signal call could not work.

# EKS managed add-ons. "latest" is an eksctl keyword that resolves to the
# newest add-on release available when the config is applied, so the
# installed versions are not pinned by this file.
addons:
  # Pod networking (Amazon VPC CNI plugin).
  - name: vpc-cni
    version: "latest"
  # In-cluster DNS.
  - name: coredns
    version: "latest"
  # Service networking rules on every node.
  - name: kube-proxy
    version: "latest"
  # CSI driver for EBS-backed persistent volumes.
  - name: aws-ebs-csi-driver
    version: "latest"

# Control-plane logging: ship all five EKS log types to CloudWatch Logs.
cloudWatch:
  clusterLogging:
    enable: true
    logTypes:
      - "api"
      - "audit"
      - "authenticator"
      - "controllerManager"
      - "scheduler"

# Cluster networking: a dedicated VPC, one shared NAT gateway, and a
# Kubernetes API endpoint reachable both privately and publicly.
vpc:
  cidr: "10.0.0.0/16"
  nat:
    # One NAT gateway for all private subnets (cheaper, not highly available).
    gateway: Single
  clusterEndpoints:
    privateAccess: true
    publicAccess: true
  # publicAccessCIDRs belongs directly under vpc; clusterEndpoints only
  # accepts privateAccess/publicAccess, and eksctl rejects unknown fields.
  # NOTE(review): 0.0.0.0/0 leaves the public API endpoint open to any
  # source address; narrow this to known CIDR ranges where possible.
  publicAccessCIDRs: ["0.0.0.0/0"]

# Fargate profile "fp-default": selects pods created in the default and
# kube-system namespaces.
# NOTE(review): pods in "default" match this profile, so GPU workloads
# presumably need to live in another namespace to land on the gpu-workers
# node group — confirm against the intended workload layout.
fargateProfiles:
  - name: fp-default
    selectors:
      - namespace: default
      - namespace: kube-system

# Envelope-encrypt Kubernetes Secrets with the referenced KMS key.
# NOTE(review): the account ID and key ID look like placeholder values;
# confirm the ARN points at a real key before applying.
secretEncryption:
  keyARN: 'arn:aws:kms:us-west-2:123456789012:key/12345678-1234-1234-1234-123456789012'
