Get rid of separate DumbSpreading function and just treat zero-limit

pods as having a constant non-zero memory and CPU limit.
moonboots · Jul 5, 2015 · 4ea8b8a · 4ea8b8a
1 parent 44ed229
commit 4ea8b8a
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 96 deletions.
diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities.go b/plugin/pkg/scheduler/algorithm/priorities/priorities.go
@@ -21,49 +21,80 @@ import (
 
  "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
  "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
+ "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
  "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
  "github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
  "github.com/golang/glog"
 )
 
 // the unused capacity is calculated on a scale of 0-10
 // 0 being the lowest priority and 10 being the highest
-func calculateScore(requested, capacity int64, node string) int {
+func calculateScore(requested int64, capacity int64, node string) int {
  if capacity == 0 {
  return 0
  }
  if requested > capacity {
- glog.Infof("Combined requested resources from existing pods exceeds capacity on minion: %s", node)
+ glog.Infof("Combined requested resources %d from existing pods exceeds capacity %d on minion: %s",
+ requested, capacity, node)
  return 0
  }
  return int(((capacity - requested) * 10) / capacity)
 }
 
+// For each of these resources, a pod that doesn't request the resource explicitly
+// will be treated as having requested the amount indicated below, for the purpose
+// of computing priority only. This ensures that when scheduling zero-limit pods, such
+// pods will not all be scheduled to the machine with the smallest in-use limit,
+// and that when scheduling regular pods, such pods will not see zero-limit pods as
+// consuming no resources whatsoever.
+const defaultMilliCpuLimit int64 = 100 // 0.1 core
+const defaultMemoryLimit int64 = 60 * 1024 * 1024 // 60 MB
+
+// TODO: Consider setting default as a fixed fraction of machine capacity (take "capacity api.ResourceList"
+// as an additional argument here) rather than using constants
+func toNonzeroLimits(limits *api.ResourceList) (int64, int64) {
+ var out_millicpu, out_memory int64
+ // Override if un-set, but not if explicitly set to zero
+ if (*limits.Cpu() == resource.Quantity{}) {
+ out_millicpu = defaultMilliCpuLimit
+ } else {
+ out_millicpu = limits.Cpu().MilliValue()
+ }
+ // Override if un-set, but not if explicitly set to zero
+ if (*limits.Memory() == resource.Quantity{}) {
+ out_memory = defaultMemoryLimit
+ } else {
+ out_memory = limits.Memory().Value()
+ }
+ return out_millicpu, out_memory
+}
+
 // Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
 // 'pods' is a list of pods currently scheduled on the node.
 func calculateResourceOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
  totalMilliCPU := int64(0)
  totalMemory := int64(0)
+ capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
+ capacityMemory := node.Status.Capacity.Memory().Value()
+
  for _, existingPod := range pods {
  for _, container := range existingPod.Spec.Containers {
- totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
- totalMemory += container.Resources.Limits.Memory().Value()
+ cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+ totalMilliCPU += cpu
+ totalMemory += memory
  }
  }
  // Add the resources requested by the current pod being scheduled.
  // This also helps differentiate between differently sized, but empty, minions.
  for _, container := range pod.Spec.Containers {
- totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
- totalMemory += container.Resources.Limits.Memory().Value()
+ cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+ totalMilliCPU += cpu
+ totalMemory += memory
  }
 
- capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
- capacityMemory := node.Status.Capacity.Memory().Value()
-
  cpuScore := calculateScore(totalMilliCPU, capacityMilliCPU, node.Name)
  memoryScore := calculateScore(totalMemory, capacityMemory, node.Name)
-// glog.V(10).Infof(
- glog.Infof(
+ glog.V(10).Infof(
  "%v -> %v: Least Requested Priority, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d, %d)",
  pod.Name, node.Name,
  totalMilliCPU, totalMemory,
@@ -95,47 +126,6 @@ func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, minionL
  return list, nil
 }
 
-func min(l, r int64) (m int64) {
- m = r
- if l < r {
- m = l
- }
- return m
-}
-
-// See comment for DumbSpreadingPriority()
-const dumbSpreadingDenominator int64 = 10
-
-// DumbSpreadingPriority is a priority function that favors nodes with fewer pods.
-// It works like LeastRequestedPeriority but instead of using 10 * percentage of machine free by resource,
-// it uses 10 * percentage of machine free by pod, with "percentage of machine free by pod" claculated as
-// (dumbSpreadingDenominator - number of pods already on the node + 1) / dumbSpreadingDenominator.
-// dumbSpreadingDenominator serves like the machine capacity in LeasRequestedPriority but is chosen
-// so that we equate one pod with a reasonable amount of resources when we combine all the scores together.
-func DumbSpreadingPriority(pod *api.Pod, podLister algorithm.PodLister, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
- nodes, err := minionLister.List()
- if err != nil {
- return algorithm.HostPriorityList{}, err
- }
- podsToMachines, err := predicates.MapPodsToMachines(podLister)
-
- list := algorithm.HostPriorityList{}
- for _, node := range nodes.Items {
- npods := int64(len(podsToMachines[node.Name]))
- score := calculateScore(min(npods+1, dumbSpreadingDenominator), dumbSpreadingDenominator, node.Name)
-// glog.V(10).Infof(
- glog.Infof(
- "%v -> %v: DumbSpreadPriority, Old # pods (%d) Score: (%d)",
- pod.Name, node.Name, npods, score,
- )
- list = append(list, algorithm.HostPriority{
- Host: node.Name,
- Score: score,
- })
- }
- return list, nil
-}
-
 type NodeLabelPrioritizer struct {
  label string
  presence bool
@@ -205,15 +195,17 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
  score := int(0)
  for _, existingPod := range pods {
  for _, container := range existingPod.Spec.Containers {
- totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
- totalMemory += container.Resources.Limits.Memory().Value()
+ cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+ totalMilliCPU += cpu
+ totalMemory += memory
  }
  }
  // Add the resources requested by the current pod being scheduled.
  // This also helps differentiate between differently sized, but empty, minions.
  for _, container := range pod.Spec.Containers {
- totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
- totalMemory += container.Resources.Limits.Memory().Value()
+ cpu, memory := toNonzeroLimits(&container.Resources.Limits)
+ totalMilliCPU += cpu
+ totalMemory += memory
  }
 
  capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
@@ -232,8 +224,7 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
  diff := math.Abs(cpuFraction - memoryFraction)
  score = int(10 - diff*10)
  }
-// glog.V(10).Infof(
- glog.Infof(
+ glog.V(10).Infof(
  "%v -> %v: Balanced Resource Allocation, Absolute/Requested: (%d, %d) / (%d, %d) Score: (%d)",
  pod.Name, node.Name,
  totalMilliCPU, totalMemory,

diff --git a/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go b/plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -19,6 +19,7 @@ package priorities
 import (
  "reflect"
  "sort"
+ "strconv"
  "testing"
 
  "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
@@ -39,86 +40,99 @@ func makeMinion(node string, milliCPU, memory int64) api.Node {
  }
 }
 
-func TestDumbSpreading(t *testing.T) {
+func TestZeroLimit(t *testing.T) {
+ // A pod with no resources. We expect spreading to count it as having the default resources.
  noResources := api.PodSpec{
- Containers: []api.Container{},
+ Containers: []api.Container{
+ {},
+ },
  }
+ noResources1 := noResources
+ noResources1.NodeName = "machine1"
+ // A pod that explicitly requests the same resources that a zero-limit pod is assigned by default (for spreading).
  small := api.PodSpec{
- NodeName: "machine1",
  Containers: []api.Container{
  {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
- "cpu": resource.MustParse("100m"),
- "memory": resource.MustParse("1000"),
+ "cpu": resource.MustParse(
+ strconv.FormatInt(defaultMilliCpuLimit, 10) + "m"),
+ "memory": resource.MustParse(
+ strconv.FormatInt(defaultMemoryLimit, 10)),
  },
  },
  },
  },
  }
+ small2 := small
+ small2.NodeName = "machine2"
+ // A larger pod.
  large := api.PodSpec{
- NodeName: "machine2",
  Containers: []api.Container{
  {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
- "cpu": resource.MustParse("600m"),
- "memory": resource.MustParse("6000"),
+ "cpu": resource.MustParse(
+ strconv.FormatInt(defaultMilliCpuLimit * 3, 10) + "m"),
+ "memory": resource.MustParse(
+ strconv.FormatInt(defaultMemoryLimit * 3, 10)),
  },
  },
  },
  },
  }
+ large1 := large
+ large1.NodeName = "machine1"
+ large2 := large
+ large2.NodeName = "machine2"
  tests := []struct {
  pod *api.Pod
  pods []*api.Pod
  nodes []api.Node
- expectedList algorithm.HostPriorityList
  test string
  }{
+ // The point of these tests is to show you get the same priority for a zero-limit pod
+ // as for a pod with the default limits, both when the zero-limit pod is already on the machine
+ // and when the zero-limit pod is the one being scheduled.
  {
- /* Minion1 CPU capacity 1000m, free 700m/7000, 3 pods
- LeastRequestedPriority score 7
- BalancedResourceAllocation score 10
- ServiceSpreadingPriority score 10
- DumbSpreadingPriority score 6
- Total: 7 + 10 + 10 + 2*6 = 39
-
- Minion2 CPU capacity 1000m, free 400m/4000, 1 pod
- LeastRequestedPriority score 4
- BalancedResourceAllocation score 10
- ServiceSpreadingPriority score 10 
- DumbSpreadingPriority score 8
- Total: 4 + 10 + 10 + 2*8 = 40
-
- Moral of the story: We prefer the machine that is more heavily loaded,
- because it has fewer pods.
- */
  pod: &api.Pod{Spec: noResources},
- nodes: []api.Node{makeMinion("machine1", 1000, 10000), makeMinion("machine2", 1000, 10000)},
- expectedList: []algorithm.HostPriority{{"machine1", 39}, {"machine2", 40}},
- test: "nothing scheduled, nothing requested",
+ // match current f1-micro on GCE
+ nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit * 10), makeMinion("machine2", 1000, defaultMemoryLimit * 10)},
+ test: "test priority of zero-limit pod with machine with zero-limit pod",
  pods: []*api.Pod {
- {Spec: small}, {Spec: small}, {Spec: small},
- {Spec: large},
+ {Spec: large1}, {Spec: noResources1},
+ {Spec: large2}, {Spec: small2},
+ },
+ },
+ {
+ pod: &api.Pod{Spec: small},
+ // match current f1-micro on GCE
+ nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit * 10), makeMinion("machine2", 1000, defaultMemoryLimit * 10)},
+ test: "test priority of nonzero-limit pod with machine with zero-limit pod",
+ pods: []*api.Pod {
+ {Spec: large1}, {Spec: noResources1},
+ {Spec: large2}, {Spec: small2},
  },
  },
  }
 
+ const expectedPriority int = 25
  for _, test := range tests {
  list, err := scheduler.PrioritizeNodes(
  test.pod,
  algorithm.FakePodLister(test.pods),
  // This should match the configuration in defaultPriorities() in
  // plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go if you want
  // to test what's actually in production.
- []algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: DumbSpreadingPriority, Weight: 2}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
+ []algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
  algorithm.FakeMinionLister(api.NodeList{Items: test.nodes}))
  if err != nil {
  t.Errorf("unexpected error: %v", err)
  }
- if !reflect.DeepEqual(test.expectedList, list) {
- t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+ for _, hp := range list {
+ if (hp.Score != expectedPriority) {
+ t.Errorf("%s: expected 25 for all priorities, got list %#v", list)
+ }
  }
  }
 }
@@ -149,13 +163,15 @@ func TestLeastRequested(t *testing.T) {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
  "cpu": resource.MustParse("1000m"),
+ "memory": resource.MustParse("0"),
  },
  },
  },
  {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
  "cpu": resource.MustParse("2000m"),
+ "memory": resource.MustParse("0"),
  },
  },
  },
@@ -479,13 +495,15 @@ func TestBalancedResourceAllocation(t *testing.T) {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
  "cpu": resource.MustParse("1000m"),
+ "memory": resource.MustParse("0"),
  },
  },
  },
  {
  Resources: api.ResourceRequirements{
  Limits: api.ResourceList{
  "cpu": resource.MustParse("2000m"),
+ "memory": resource.MustParse("0"),
  },
  },
  },

diff --git a/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/service_spreading.go
@@ -83,8 +83,7 @@ func (s *ServiceSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorith
  fScore = 10 * (float32(maxCount-counts[minion.Name]) / float32(maxCount))
  }
  result = append(result, algorithm.HostPriority{Host: minion.Name, Score: int(fScore)})
- // glog.V(10).Infof(
- glog.Infof(
+ glog.V(10).Infof(
  "%v -> %v: ServiceSpreadPriority, Sore: (%d)", pod.Name, minion.Name, int(fScore),
  )
  }

diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -65,8 +65,6 @@ func defaultPriorities() util.StringSet {
  factory.RegisterPriorityFunction("LeastRequestedPriority", priorities.LeastRequestedPriority, 1),
  // Prioritizes nodes to help achieve balanced resource usage
  factory.RegisterPriorityFunction("BalancedResourceAllocation", priorities.BalancedResourceAllocation, 1),
- // Prioritizes nodes to achieve approximately equal number of pods per node
- factory.RegisterPriorityFunction("DumbSpreadingPriority", priorities.DumbSpreadingPriority, 2),
  // spreads pods by minimizing the number of pods (belonging to the same service) on the same minion.
  factory.RegisterPriorityConfigFactory(
  "ServiceSpreadingPriority",