Skip to content

Commit

Permalink
Adding more Metrics (#628)
Browse files Browse the repository at this point in the history
* Add aws API related metrics

* Add unit test
  • Loading branch information
liwenwu-amazon authored May 2, 2024
1 parent 086bcb0 commit 9c7a642
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 3 deletions.
4 changes: 3 additions & 1 deletion cmd/aws-application-networking-k8s/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ import (
"github.com/aws/aws-application-networking-k8s/pkg/k8s"
discoveryv1 "k8s.io/api/discovery/v1"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
Expand Down Expand Up @@ -126,7 +128,7 @@ func main() {
AccountId: config.AccountID,
Region: config.Region,
ClusterName: config.ClusterName,
})
}, metrics.Registry)
if err != nil {
setupLog.Fatal("cloud client setup failed: %s", err)
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/onsi/ginkgo v1.16.5
github.com/onsi/gomega v1.27.10
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.17.0
github.com/stretchr/testify v1.8.4
go.uber.org/zap v1.26.0
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
Expand Down Expand Up @@ -53,7 +54,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/nxadm/tail v1.4.8 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.17.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
Expand Down
13 changes: 12 additions & 1 deletion pkg/aws/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ package aws
import (
"fmt"

"github.com/prometheus/client_golang/prometheus"

"github.com/aws/aws-sdk-go/aws/request"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/vpclattice"
"golang.org/x/exp/maps"

"context"
"github.com/aws/aws-application-networking-k8s/pkg/aws/metrics"
"github.com/aws/aws-application-networking-k8s/pkg/aws/services"
"github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog"
)
Expand Down Expand Up @@ -47,7 +50,7 @@ type Cloud interface {
}

// NewCloud constructs new Cloud implementation.
func NewCloud(log gwlog.Logger, cfg CloudConfig) (Cloud, error) {
func NewCloud(log gwlog.Logger, cfg CloudConfig, metricsRegisterer prometheus.Registerer) (Cloud, error) {
sess, err := session.NewSession()
if err != nil {
return nil, err
Expand All @@ -70,6 +73,14 @@ func NewCloud(log gwlog.Logger, cfg CloudConfig) (Cloud, error) {
}
})

if metricsRegisterer != nil {
metricsCollector, err := metrics.NewCollector(metricsRegisterer)
if err != nil {
return nil, err
}
metricsCollector.InjectHandlers(&sess.Handlers)
}

lattice := services.NewDefaultLattice(sess, cfg.AccountId, cfg.Region)
tagging := services.NewDefaultTagging(sess, cfg.Region)
cl := NewDefaultCloudWithTagging(lattice, tagging, cfg)
Expand Down
110 changes: 110 additions & 0 deletions pkg/aws/metrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package metrics

import (
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/prometheus/client_golang/prometheus"
"strconv"
"time"
)

const (
sdkHandlerCollectAPICallMetric = "collectAPICallMetric"
sdkHandlerCollectAPIRequestMetric = "collectAPIRequestMetric"
)

type collector struct {
instruments *instruments
}

func NewCollector(registerer prometheus.Registerer) (*collector, error) {
instruments, err := newInstruments(registerer)
if err != nil {
return nil, err
}
return &collector{
instruments: instruments,
}, nil
}

func (c *collector) InjectHandlers(handlers *request.Handlers) {
handlers.CompleteAttempt.PushFrontNamed(request.NamedHandler{
Name: sdkHandlerCollectAPIRequestMetric,
Fn: c.collectAPIRequestMetric,
})
handlers.Complete.PushFrontNamed(request.NamedHandler{
Name: sdkHandlerCollectAPICallMetric,
Fn: c.collectAPICallMetric,
})
}

func (c *collector) collectAPIRequestMetric(r *request.Request) {
service := r.ClientInfo.ServiceID
operation := r.Operation.Name
statusCode := statusCodeForRequest(r)
errorCode := errorCodeForRequest(r)
duration := time.Since(r.AttemptTime)

c.instruments.apiRequestsTotal.With(map[string]string{
labelService: service,
labelOperation: operation,
labelStatusCode: statusCode,
labelErrorCode: errorCode,
}).Inc()
c.instruments.apiRequestDurationSecond.With(map[string]string{
labelService: service,
labelOperation: operation,
}).Observe(duration.Seconds())
}

func (c *collector) collectAPICallMetric(r *request.Request) {
service := r.ClientInfo.ServiceID
operation := r.Operation.Name
statusCode := statusCodeForRequest(r)
errorCode := errorCodeForRequest(r)
duration := time.Since(r.Time)

c.instruments.apiCallsTotal.With(map[string]string{
labelService: service,
labelOperation: operation,
labelStatusCode: statusCode,
labelErrorCode: errorCode,
}).Inc()
c.instruments.apiCallDurationSeconds.With(map[string]string{
labelService: service,
labelOperation: operation,
}).Observe(duration.Seconds())
c.instruments.apiCallRetries.With(map[string]string{
labelService: service,
labelOperation: operation,
}).Observe(float64(r.RetryCount))
}

// statusCodeForRequest returns the http status code for request.
// if there is no http response, returns "0".
func statusCodeForRequest(r *request.Request) string {
if r.HTTPResponse != nil {
return strconv.Itoa(r.HTTPResponse.StatusCode)
}
return "0"
}

// errorCodeForRequest returns the error code for request.
// if no error happened, returns "".
func errorCodeForRequest(r *request.Request) string {
if r.Error != nil {
if awserr, ok := r.Error.(awserr.Error); ok {
return awserr.Code()
}
return "internal"
}
return ""
}

// operationForRequest returns the operation for request.
func operationForRequest(r *request.Request) string {
if r.Operation != nil {
return r.Operation.Name
}
return "?"
}
125 changes: 125 additions & 0 deletions pkg/aws/metrics/collector_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package metrics

import (
"errors"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/stretchr/testify/assert"
"net/http"
"testing"
)

func Test_statusCodeForRequest(t *testing.T) {
type args struct {
r *request.Request
}
tests := []struct {
name string
args args
want string
}{
{
name: "requests without http response",
args: args{
r: &request.Request{},
},
want: "0",
},
{
name: "requests with http response",
args: args{
r: &request.Request{
HTTPResponse: &http.Response{
StatusCode: 200,
},
},
},
want: "200",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := statusCodeForRequest(tt.args.r)
assert.Equal(t, tt.want, got)
})
}
}

func Test_errorCodeForRequest(t *testing.T) {
type args struct {
r *request.Request
}
tests := []struct {
name string
args args
want string
}{
{
name: "requests without error",
args: args{
r: &request.Request{},
},
want: "",
},
{
name: "requests with internal error",
args: args{
r: &request.Request{
Error: errors.New("oops, some internal error"),
},
},
want: "internal",
},
{
name: "requests with aws error",
args: args{
r: &request.Request{
Error: awserr.New("NotFoundException", "", nil),
},
},
want: "NotFoundException",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := errorCodeForRequest(tt.args.r)
assert.Equal(t, tt.want, got)
})
}
}

func Test_operationForRequest(t *testing.T) {
type args struct {
r *request.Request
}
tests := []struct {
name string
args args
want string
}{
{
name: "requests without operation",
args: args{
r: &request.Request{},
},
want: "?",
},
{
name: "requests with operation",
args: args{
r: &request.Request{
Operation: &request.Operation{
Name: "DescribeMesh",
},
},
},
want: "DescribeMesh",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := operationForRequest(tt.args.r)
assert.Equal(t, tt.want, got)
})
}
}
85 changes: 85 additions & 0 deletions pkg/aws/metrics/instruments.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
)

const (
metricSubsystemAWS = "aws"

metricAPICallsTotal = "api_calls_total"
metricAPICallDurationSeconds = "api_call_duration_seconds"
metricAPICallRetries = "api_call_retries"

metricAPIRequestsTotal = "api_requests_total"
metricAPIRequestDurationSeconds = "api_request_duration_seconds"
)

const (
labelService = "service"
labelOperation = "operation"
labelStatusCode = "status_code"
labelErrorCode = "error_code"
)

type instruments struct {
apiCallsTotal *prometheus.CounterVec
apiCallDurationSeconds *prometheus.HistogramVec
apiCallRetries *prometheus.HistogramVec
apiRequestsTotal *prometheus.CounterVec
apiRequestDurationSecond *prometheus.HistogramVec
}

// newInstruments allocates and register new metrics to registerer
func newInstruments(registerer prometheus.Registerer) (*instruments, error) {
apiCallsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: metricSubsystemAWS,
Name: metricAPICallsTotal,
Help: "Total number of SDK API calls from the customer's code to AWS services",
}, []string{labelService, labelOperation, labelStatusCode, labelErrorCode})
apiCallDurationSeconds := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: metricSubsystemAWS,
Name: metricAPICallDurationSeconds,
Help: "Perceived latency from when your code makes an SDK call, includes retries",
}, []string{labelService, labelOperation})
apiCallRetries := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: metricSubsystemAWS,
Name: metricAPICallRetries,
Help: "Number of times the SDK retried requests to AWS services for SDK API calls",
Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
}, []string{labelService, labelOperation})

apiRequestsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: metricSubsystemAWS,
Name: metricAPIRequestsTotal,
Help: "Total number of HTTP requests that the SDK made",
}, []string{labelService, labelOperation, labelStatusCode, labelErrorCode})
apiRequestDurationSecond := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: metricSubsystemAWS,
Name: metricAPIRequestDurationSeconds,
Help: "Latency of an individual HTTP request to the service endpoint",
}, []string{labelService, labelOperation})

if err := registerer.Register(apiCallsTotal); err != nil {
return nil, err
}
if err := registerer.Register(apiCallDurationSeconds); err != nil {
return nil, err
}
if err := registerer.Register(apiCallRetries); err != nil {
return nil, err
}
if err := registerer.Register(apiRequestsTotal); err != nil {
return nil, err
}
if err := registerer.Register(apiRequestDurationSecond); err != nil {
return nil, err
}
return &instruments{
apiCallsTotal: apiCallsTotal,
apiCallDurationSeconds: apiCallDurationSeconds,
apiCallRetries: apiCallRetries,
apiRequestsTotal: apiRequestsTotal,
apiRequestDurationSecond: apiRequestDurationSecond,
}, nil
}

0 comments on commit 9c7a642

Please sign in to comment.