Skip to content

Commit

Permalink
remove the specialness from GPU requests (aws#1489)
Browse files Browse the repository at this point in the history
* remove the specialness from GPU requests

This modifies Karpenter to treat resource requests identically. It
removes the special logic for some AWS specific GPU types and
allows CloudProvider implementers to provide their own custom
resource types along with an ordering to be used as a hint for
binpacking regarding which instance types to prefer.

Fixes aws#1516
  • Loading branch information
tzneal authored Mar 21, 2022
1 parent f425d5c commit 3f28a21
Show file tree
Hide file tree
Showing 19 changed files with 532 additions and 297 deletions.
4 changes: 3 additions & 1 deletion pkg/cloudprovider/aws/amifamily/al2.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package amifamily
import (
"fmt"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
core "k8s.io/api/core/v1"

Expand All @@ -33,7 +35,7 @@ type AL2 struct {
// SSMAlias returns the AMI Alias to query SSM
func (a AL2) SSMAlias(version string, instanceType cloudprovider.InstanceType) string {
amiSuffix := ""
if !instanceType.NvidiaGPUs().IsZero() || !instanceType.AWSNeurons().IsZero() {
if !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU]) || !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceAWSNeuron]) {
amiSuffix = "-gpu"
} else if instanceType.Architecture() == v1alpha5.ArchitectureArm64 {
amiSuffix = fmt.Sprintf("-%s", instanceType.Architecture())
Expand Down
4 changes: 3 additions & 1 deletion pkg/cloudprovider/aws/amifamily/bottlerocket.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ package amifamily
import (
"fmt"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
core "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand All @@ -35,7 +37,7 @@ type Bottlerocket struct {
func (b Bottlerocket) SSMAlias(version string, instanceType cloudprovider.InstanceType) string {
arch := "x86_64"
amiSuffix := ""
if !instanceType.NvidiaGPUs().IsZero() {
if !resources.IsZero(instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU]) {
amiSuffix = "-nvidia"
}
if instanceType.Architecture() == v1alpha5.ArchitectureArm64 {
Expand Down
5 changes: 5 additions & 0 deletions pkg/cloudprovider/aws/apis/v1alpha1/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package v1alpha1

import (
"github.com/aws/aws-sdk-go/service/ec2"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer"
Expand All @@ -41,6 +42,10 @@ var (
AMIFamilyAL2,
AMIFamilyUbuntu,
}
ResourceNVIDIAGPU v1.ResourceName = "nvidia.com/gpu"
ResourceAMDGPU v1.ResourceName = "amd.com/gpu"
ResourceAWSNeuron v1.ResourceName = "aws.amazon.com/neuron"
ResourceAWSPodENI v1.ResourceName = "vpc.amazonaws.com/pod-eni"
)

var (
Expand Down
24 changes: 24 additions & 0 deletions pkg/cloudprovider/aws/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"fmt"
"time"

"github.com/aws/karpenter/pkg/utils/resources"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/client"
"github.com/aws/aws-sdk-go/aws/ec2metadata"
Expand Down Expand Up @@ -104,6 +106,8 @@ func (c *CloudProvider) Create(ctx context.Context, constraints *v1alpha5.Constr
if err != nil {
return err
}
instanceTypes = c.filterInstanceTypes(instanceTypes)

// Create will only return an error if zero nodes could be launched.
// Partial fulfillment will be logged
nodes, err := c.instanceProvider.Create(ctx, vendorConstraints, instanceTypes, quantity)
Expand Down Expand Up @@ -157,6 +161,26 @@ func (c *CloudProvider) Name() string {
return "aws"
}

// filterInstanceTypes prefers instance types without accelerators. An instance
// type counts as accelerator-free when its AWS Neuron, AMD GPU and NVIDIA GPU
// resources are all zero. If at least one such type is present, only those
// types are returned; if every candidate carries an accelerator, the input
// list is returned unchanged.
func (c *CloudProvider) filterInstanceTypes(instanceTypes []cloudprovider.InstanceType) []cloudprovider.InstanceType {
	var withoutAccelerators []cloudprovider.InstanceType
	for _, candidate := range instanceTypes {
		capacity := candidate.Resources()
		// Skip the candidate as soon as any accelerator resource is non-zero.
		switch {
		case !resources.IsZero(capacity[v1alpha1.ResourceAWSNeuron]):
			continue
		case !resources.IsZero(capacity[v1alpha1.ResourceAMDGPU]):
			continue
		case !resources.IsZero(capacity[v1alpha1.ResourceNVIDIAGPU]):
			continue
		}
		withoutAccelerators = append(withoutAccelerators, candidate)
	}
	// Nothing accelerator-free was offered, so fall back to the full list.
	if len(withoutAccelerators) == 0 {
		return instanceTypes
	}
	return withoutAccelerators
}

// get the current region from EC2 IMDS
func getRegionFromIMDS(sess *session.Session) string {
region, err := ec2metadata.New(sess).Region()
Expand Down
9 changes: 7 additions & 2 deletions pkg/cloudprovider/aws/fake/ssmapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ package fake

import (
"context"
"fmt"

"github.com/mitchellh/hashstructure/v2"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/request"
Expand All @@ -29,14 +32,16 @@ type SSMAPI struct {
WantErr error
}

// GetParameterWithContext is the fake SSM lookup. It returns WantErr when that
// field is set, then GetParameterOutput when that field is set, and otherwise a
// synthesized parameter value. The synthesized value is "test-ami-id-" followed
// by the hex hash of input.Name; the error returned by hashstructure.Hash is
// discarded.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature and of the Parameter line appear below.
func (a SSMAPI) GetParameterWithContext(context.Context, *ssm.GetParameterInput, ...request.Option) (*ssm.GetParameterOutput, error) {
func (a SSMAPI) GetParameterWithContext(ctx context.Context, input *ssm.GetParameterInput, opts ...request.Option) (*ssm.GetParameterOutput, error) {
if a.WantErr != nil {
return nil, a.WantErr
}
// input is dereferenced here before the canned-output check below.
hc, _ := hashstructure.Hash(input.Name, hashstructure.FormatV2, nil)
if a.GetParameterOutput != nil {
return a.GetParameterOutput, nil
}

return &ssm.GetParameterOutput{
Parameter: &ssm.Parameter{Value: aws.String("test-ami-id")},
Parameter: &ssm.Parameter{Value: aws.String(fmt.Sprintf("test-ami-id-%x", hc))},
}, nil
}
22 changes: 10 additions & 12 deletions pkg/cloudprovider/aws/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@ const (
CreationQPS = 2
// CreationBurst limits the additional burst requests.
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/throttling.html#throttling-limits
CreationBurst = 100
nvidiaGPUResourceName v1.ResourceName = "nvidia.com/gpu"
amdGPUResourceName v1.ResourceName = "amd.com/gpu"
awsNeuronResourceName v1.ResourceName = "aws.amazon.com/neuron"
CreationBurst = 100
)

type InstanceProvider struct {
Expand Down Expand Up @@ -276,17 +273,18 @@ func (p *InstanceProvider) instanceToNode(ctx context.Context, instance *ec2.Ins
if injection.GetOptions(ctx).GetAWSNodeNameConvention() == options.ResourceName {
nodeName = aws.StringValue(instance.InstanceId)
}

resources := v1.ResourceList{}
for resourceName, quantity := range map[v1.ResourceName]*resource.Quantity{
v1.ResourcePods: instanceType.Pods(),
v1.ResourceCPU: instanceType.CPU(),
v1.ResourceMemory: instanceType.Memory(),
nvidiaGPUResourceName: instanceType.NvidiaGPUs(),
amdGPUResourceName: instanceType.AMDGPUs(),
awsNeuronResourceName: instanceType.AWSNeurons(),
for resourceName, quantity := range map[v1.ResourceName]resource.Quantity{
v1.ResourcePods: instanceType.Resources()[v1.ResourcePods],
v1.ResourceCPU: instanceType.Resources()[v1.ResourceCPU],
v1.ResourceMemory: instanceType.Resources()[v1.ResourceMemory],
v1alpha1.ResourceNVIDIAGPU: instanceType.Resources()[v1alpha1.ResourceNVIDIAGPU],
v1alpha1.ResourceAMDGPU: instanceType.Resources()[v1alpha1.ResourceAMDGPU],
v1alpha1.ResourceAWSNeuron: instanceType.Resources()[v1alpha1.ResourceAWSNeuron],
} {
if !quantity.IsZero() {
resources[resourceName] = *quantity
resources[resourceName] = quantity
}
}

Expand Down
71 changes: 52 additions & 19 deletions pkg/cloudprovider/aws/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,66 +60,98 @@ func (i *InstanceType) Architecture() string {
return fmt.Sprint(aws.StringValueSlice(i.ProcessorInfo.SupportedArchitectures)) // Unrecognized, but used for error printing
}

func (i *InstanceType) CPU() *resource.Quantity {
return resources.Quantity(fmt.Sprint(*i.VCpuInfo.DefaultVCpus))
// Resources returns the instance type's resources keyed by resource name:
// CPU, memory, pods, pod ENIs, NVIDIA GPUs, AMD GPUs and AWS Neurons. Every
// key is always present; a resource the instance type lacks is reported by
// its helper as a zero quantity.
func (i *InstanceType) Resources() v1.ResourceList {
	capacity := make(v1.ResourceList, 7)
	capacity[v1.ResourceCPU] = i.cpu()
	capacity[v1.ResourceMemory] = i.memory()
	capacity[v1.ResourcePods] = i.pods()
	capacity[v1alpha1.ResourceAWSPodENI] = i.awsPodENI()
	capacity[v1alpha1.ResourceNVIDIAGPU] = i.nvidiaGPUs()
	capacity[v1alpha1.ResourceAMDGPU] = i.amdGPUs()
	capacity[v1alpha1.ResourceAWSNeuron] = i.awsNeurons()
	return capacity
}

// Price returns a weighted sum of the instance type's vCPUs, memory and GPUs.
// One vCPU weighs 1, one GiB of memory weighs 1, and one GPU weighs 5. GPU
// entries with a nil Count are ignored, and a nil GpuInfo contributes nothing.
// VCpuInfo.DefaultVCpus and MemoryInfo.SizeInMiB are dereferenced without nil
// checks.
func (i *InstanceType) Price() float64 {
	const (
		GPUCostWeight = 5
		CPUCostWeight = 1
		// SizeInMiB is expressed in MiB, so the per-MiB weight must be 1/1024
		// for one GiB to weigh the same as one vCPU. A per-MiB weight of 1024
		// makes memory dominate the sum and the CPU and GPU terms negligible.
		MemoryMBCostWeight = 1 / 1024.0
	)

	gpuCount := 0.0
	if i.GpuInfo != nil {
		for _, gpu := range i.GpuInfo.Gpus {
			if gpu.Count != nil {
				gpuCount += float64(*gpu.Count)
			}
		}
	}

	return CPUCostWeight*float64(*i.VCpuInfo.DefaultVCpus) +
		MemoryMBCostWeight*float64(*i.MemoryInfo.SizeInMiB) +
		GPUCostWeight*gpuCount
}
// cpu returns the instance type's default vCPU count as a resource quantity.
// VCpuInfo.DefaultVCpus is dereferenced without a nil check.
func (i *InstanceType) cpu() resource.Quantity {
	defaultVCPUs := *i.VCpuInfo.DefaultVCpus
	quantity := resources.Quantity(fmt.Sprint(defaultVCPUs))
	return *quantity
}

// memory returns the instance type's memory as a resource quantity in Mi.
// MemoryInfo.SizeInMiB is multiplied by EC2VMAvailableMemoryFactor and the
// product is truncated to an int32 before being formatted.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature and of the return appear below.
func (i *InstanceType) Memory() *resource.Quantity {
return resources.Quantity(
func (i *InstanceType) memory() resource.Quantity {
return *resources.Quantity(
fmt.Sprintf("%dMi", int32(
float64(*i.MemoryInfo.SizeInMiB)*EC2VMAvailableMemoryFactor,
)),
)
}

// pods returns the pod capacity as a resource quantity. When MaxPods is set
// its value is used; otherwise the result of eniLimitedPods() is used.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature and of each return appear below.
func (i *InstanceType) Pods() *resource.Quantity {
func (i *InstanceType) pods() resource.Quantity {
if i.MaxPods != nil {
return resources.Quantity(fmt.Sprint(ptr.Int32Value(i.MaxPods)))
return *resources.Quantity(fmt.Sprint(ptr.Int32Value(i.MaxPods)))
}
return resources.Quantity(fmt.Sprint(i.eniLimitedPods()))
return *resources.Quantity(fmt.Sprint(i.eniLimitedPods()))
}

// awsPodENI returns the pod-ENI capacity as a resource quantity. It looks the
// instance type name up in vpc.Limits and returns that entry's BranchInterface
// when the entry exists and is marked IsTrunkingCompatible; otherwise it
// returns a zero quantity.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature and of each return appear below.
func (i *InstanceType) AWSPodENI() *resource.Quantity {
func (i *InstanceType) awsPodENI() resource.Quantity {
// https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types
limits, ok := vpc.Limits[aws.StringValue(i.InstanceType)]
if ok && limits.IsTrunkingCompatible {
return resources.Quantity(fmt.Sprint(limits.BranchInterface))
return *resources.Quantity(fmt.Sprint(limits.BranchInterface))
}
return resources.Quantity("0")
return *resources.Quantity("0")
}

// nvidiaGPUs returns the number of NVIDIA GPUs as a resource quantity. It sums
// Count over the GpuInfo.Gpus entries whose Manufacturer is "NVIDIA"; the
// quantity is zero when GpuInfo is nil. Manufacturer and Count are
// dereferenced without nil checks.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature, the manufacturer check and the return appear
// below. The removed check inspected Gpus[0]; the added one inspects the
// device being iterated.
func (i *InstanceType) NvidiaGPUs() *resource.Quantity {
func (i *InstanceType) nvidiaGPUs() resource.Quantity {
count := int64(0)
if i.GpuInfo != nil {
for _, gpu := range i.GpuInfo.Gpus {
if *i.GpuInfo.Gpus[0].Manufacturer == "NVIDIA" {
if *gpu.Manufacturer == "NVIDIA" {
count += *gpu.Count
}
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

func (i *InstanceType) AMDGPUs() *resource.Quantity {
func (i *InstanceType) amdGPUs() resource.Quantity {
count := int64(0)
if i.GpuInfo != nil {
for _, gpu := range i.GpuInfo.Gpus {
if *i.GpuInfo.Gpus[0].Manufacturer == "AMD" {
if *gpu.Manufacturer == "NVIDIA" {
count += *gpu.Count
}
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

// awsNeurons returns the number of inference accelerators as a resource
// quantity. It sums Count over InferenceAcceleratorInfo.Accelerators; the
// quantity is zero when InferenceAcceleratorInfo is nil. Count is dereferenced
// without a nil check.
// NOTE(review): this span is rendered diff output, so both the removed and the
// added form of the signature and of the return appear below.
func (i *InstanceType) AWSNeurons() *resource.Quantity {
func (i *InstanceType) awsNeurons() resource.Quantity {
count := int64(0)
if i.InferenceAcceleratorInfo != nil {
for _, accelerator := range i.InferenceAcceleratorInfo.Accelerators {
count += *accelerator.Count
}
}
return resources.Quantity(fmt.Sprint(count))
return *resources.Quantity(fmt.Sprint(count))
}

// Overhead computes overhead for https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable
Expand Down Expand Up @@ -152,7 +184,8 @@ func (i *InstanceType) Overhead() v1.ResourceList {
{start: 2000, end: 4000, percentage: 0.005},
{start: 4000, end: 1 << 31, percentage: 0.0025},
} {
if cpu := i.CPU().MilliValue(); cpu >= cpuRange.start {
cpuSt := i.cpu()
if cpu := cpuSt.MilliValue(); cpu >= cpuRange.start {
r := float64(cpuRange.end - cpuRange.start)
if cpu < cpuRange.end {
r = float64(cpu - cpuRange.start)
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/aws/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (p *InstanceTypeProvider) Get(ctx context.Context, provider *v1alpha1.AWS)
if err != nil {
return nil, err
}
result := []cloudprovider.InstanceType{}
var result []cloudprovider.InstanceType
for _, instanceType := range instanceTypes {
offerings := p.createOfferings(instanceType, subnetZones, instanceTypeZones[instanceType.Name()])
if len(offerings) > 0 {
Expand Down
Loading

0 comments on commit 3f28a21

Please sign in to comment.