Skip to content

Commit

Permalink
[cluster-autoscaler-release-1.28] Dynamic listing of SKUs (#7257)
Browse files Browse the repository at this point in the history
* Dynamic listing of SKUs (master)

* re-add enableDynamicInstanceList

* update README.md

---------

Co-authored-by: Rachel Gregory <[email protected]>
  • Loading branch information
k8s-infra-cherrypick-robot and rakechill authored Sep 6, 2024
1 parent ba3cafd commit 8e2828f
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 29 deletions.
60 changes: 32 additions & 28 deletions cluster-autoscaler/cloudprovider/azure/azure_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package azure

import (
"context"
"errors"
"reflect"
"regexp"
"strings"
Expand Down Expand Up @@ -96,7 +97,7 @@ type azureCache struct {
unownedInstances map[azureRef]bool

autoscalingOptions map[azureRef]map[string]string
skus map[string]*skewer.Cache
skus *skewer.Cache
}

func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) {
Expand All @@ -113,17 +114,19 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*az
instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup),
unownedInstances: make(map[azureRef]bool),
autoscalingOptions: make(map[azureRef]map[string]string),
skus: make(map[string]*skewer.Cache),
}

if config.EnableDynamicInstanceList {
cache.skus[config.Location] = &skewer.Cache{}
skus: &skewer.Cache{}, // populated iff config.EnableDynamicInstanceList
}

if err := cache.regenerate(); err != nil {
klog.Errorf("Error while regenerating Azure cache: %v", err)
}

if config.EnableDynamicInstanceList {
if err := cache.fetchSKUCache(config.Location); err != nil {
klog.Errorf("Error while populating SKU list: %v", err)
}
}

return cache, nil
}

Expand Down Expand Up @@ -186,28 +189,30 @@ func (m *azureCache) regenerate() error {
newAutoscalingOptions[ref] = options
}

newSkuCache := make(map[string]*skewer.Cache)
for location := range m.skus {
cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
newSkuCache[location] = cache
}

m.mutex.Lock()
defer m.mutex.Unlock()

m.instanceToNodeGroup = newInstanceToNodeGroupCache
m.autoscalingOptions = newAutoscalingOptions
m.skus = newSkuCache

// Reset unowned instances cache.
m.unownedInstances = make(map[azureRef]bool)

return nil
}

func (m *azureCache) fetchSKUCache(location string) error {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, err := m.fetchSKUs(context.Background(), location)
if err != nil {
return err
}
m.skus = cache
return nil
}

// fetchAzureResources retrieves and updates the cached Azure resources.
//
// This function performs the following:
Expand Down Expand Up @@ -359,27 +364,26 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool {
}

func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) {
if location == "" {
return nil, errors.New("location not specified")
}

return skewer.NewCache(ctx,
skewer.WithLocation(location),
skewer.WithResourceClient(m.azClient.skuClient),
)
}

// HasVMSKUS returns true if the cache has any VM SKUs. Can be used to judge success of loading.
func (m *azureCache) HasVMSKUs() bool {
// not nil or empty (using the only exposed semi-efficient way to check)
return !(m.skus == nil || m.skus.Equal(&skewer.Cache{}))
}

func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) {
m.mutex.Lock()
defer m.mutex.Unlock()

cache, ok := m.skus[location]
if !ok {
var err error
cache, err = m.fetchSKUs(ctx, location)
if err != nil {
klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
return skewer.SKU{}, err
}
m.skus[location] = cache
}

cache := m.skus
return cache.Get(ctx, skuName, skewer.VirtualMachines, location)
}

Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ type Config struct {
// EnableForceDelete defines whether to enable force deletion on the APIs
EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"`

// EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
// (DEPRECATED, DO NOT USE) EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"`

// (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info
Expand Down
6 changes: 6 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi
}
manager.azureCache = cache

if !manager.azureCache.HasVMSKUs() {
klog.Warning("No VM SKU info loaded, using only static SKU list")
cfg.EnableDynamicInstanceList = false
}

specs, err := ParseLabelAutoDiscoverySpecs(discoveryOpts)
if err != nil {
return nil, err
Expand All @@ -130,6 +135,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi
Cap: 10 * time.Minute,
}

// skuCache will already be created at this step by newAzureCache()
err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) {
return manager.forceRefresh()
})
Expand Down

0 comments on commit 8e2828f

Please sign in to comment.