diff --git a/cmd/aws-application-networking-k8s/main.go b/cmd/aws-application-networking-k8s/main.go index 744da27a..cc4abdf3 100644 --- a/cmd/aws-application-networking-k8s/main.go +++ b/cmd/aws-application-networking-k8s/main.go @@ -52,6 +52,8 @@ import ( "github.com/aws/aws-application-networking-k8s/pkg/k8s" discoveryv1 "k8s.io/api/discovery/v1" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + "sigs.k8s.io/controller-runtime/pkg/metrics" ) var ( @@ -126,7 +128,7 @@ func main() { AccountId: config.AccountID, Region: config.Region, ClusterName: config.ClusterName, - }) + }, metrics.Registry) if err != nil { setupLog.Fatal("cloud client setup failed: %s", err) } diff --git a/go.mod b/go.mod index 5d506c63..6b00ec2b 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/onsi/ginkgo v1.16.5 github.com/onsi/gomega v1.27.10 github.com/pkg/errors v0.9.1 + github.com/prometheus/client_golang v1.17.0 github.com/stretchr/testify v1.8.4 go.uber.org/zap v1.26.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa @@ -53,7 +54,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/nxadm/tail v1.4.8 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.17.0 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect diff --git a/pkg/aws/cloud.go b/pkg/aws/cloud.go index aae7de93..77f61a35 100644 --- a/pkg/aws/cloud.go +++ b/pkg/aws/cloud.go @@ -3,12 +3,15 @@ package aws import ( "fmt" + "github.com/prometheus/client_golang/prometheus" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/vpclattice" "golang.org/x/exp/maps" "context" + "github.com/aws/aws-application-networking-k8s/pkg/aws/metrics" "github.com/aws/aws-application-networking-k8s/pkg/aws/services" "github.com/aws/aws-application-networking-k8s/pkg/utils/gwlog" ) @@ -47,7 +50,7 @@ type Cloud interface { } // NewCloud constructs new Cloud implementation. -func NewCloud(log gwlog.Logger, cfg CloudConfig) (Cloud, error) { +func NewCloud(log gwlog.Logger, cfg CloudConfig, metricsRegisterer prometheus.Registerer) (Cloud, error) { sess, err := session.NewSession() if err != nil { return nil, err @@ -70,6 +73,14 @@ func NewCloud(log gwlog.Logger, cfg CloudConfig) (Cloud, error) { } }) + if metricsRegisterer != nil { + metricsCollector, err := metrics.NewCollector(metricsRegisterer) + if err != nil { + return nil, err + } + metricsCollector.InjectHandlers(&sess.Handlers) + } + lattice := services.NewDefaultLattice(sess, cfg.AccountId, cfg.Region) tagging := services.NewDefaultTagging(sess, cfg.Region) cl := NewDefaultCloudWithTagging(lattice, tagging, cfg) diff --git a/pkg/aws/metrics/collector.go b/pkg/aws/metrics/collector.go new file mode 100644 index 00000000..0a6db8ca --- /dev/null +++ b/pkg/aws/metrics/collector.go @@ -0,0 +1,110 @@ +package metrics + +import ( + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/prometheus/client_golang/prometheus" + "strconv" + "time" +) + +const ( + sdkHandlerCollectAPICallMetric = "collectAPICallMetric" + sdkHandlerCollectAPIRequestMetric = "collectAPIRequestMetric" +) + +type collector struct { + instruments *instruments +} + +func NewCollector(registerer prometheus.Registerer) (*collector, error) { + instruments, err := newInstruments(registerer) + if err != nil { + return nil, err + } + return &collector{ + instruments: instruments, + }, nil +} + +func (c *collector) InjectHandlers(handlers *request.Handlers) { + handlers.CompleteAttempt.PushFrontNamed(request.NamedHandler{ + Name: sdkHandlerCollectAPIRequestMetric, + Fn: c.collectAPIRequestMetric, + }) + handlers.Complete.PushFrontNamed(request.NamedHandler{ + Name: sdkHandlerCollectAPICallMetric, + Fn: c.collectAPICallMetric, + }) +} + +func (c *collector) collectAPIRequestMetric(r *request.Request) { + service := r.ClientInfo.ServiceID + operation := r.Operation.Name + statusCode := statusCodeForRequest(r) + errorCode := errorCodeForRequest(r) + duration := time.Since(r.AttemptTime) + + c.instruments.apiRequestsTotal.With(map[string]string{ + labelService: service, + labelOperation: operation, + labelStatusCode: statusCode, + labelErrorCode: errorCode, + }).Inc() + c.instruments.apiRequestDurationSecond.With(map[string]string{ + labelService: service, + labelOperation: operation, + }).Observe(duration.Seconds()) +} + +func (c *collector) collectAPICallMetric(r *request.Request) { + service := r.ClientInfo.ServiceID + operation := r.Operation.Name + statusCode := statusCodeForRequest(r) + errorCode := errorCodeForRequest(r) + duration := time.Since(r.Time) + + c.instruments.apiCallsTotal.With(map[string]string{ + labelService: service, + labelOperation: operation, + labelStatusCode: statusCode, + labelErrorCode: errorCode, + }).Inc() + c.instruments.apiCallDurationSeconds.With(map[string]string{ + labelService: service, + labelOperation: operation, + }).Observe(duration.Seconds()) + c.instruments.apiCallRetries.With(map[string]string{ + labelService: service, + labelOperation: operation, + }).Observe(float64(r.RetryCount)) +} + +// statusCodeForRequest returns the http status code for request. +// if there is no http response, returns "0". +func statusCodeForRequest(r *request.Request) string { + if r.HTTPResponse != nil { + return strconv.Itoa(r.HTTPResponse.StatusCode) + } + return "0" +} + +// errorCodeForRequest returns the error code for request. +// if no error happened, returns "". +func errorCodeForRequest(r *request.Request) string { + if r.Error != nil { + if awserr, ok := r.Error.(awserr.Error); ok { + return awserr.Code() + } + return "internal" + } + return "" +} + +// operationForRequest returns the operation for request. +func operationForRequest(r *request.Request) string { + if r.Operation != nil { + return r.Operation.Name + } + return "?" +} diff --git a/pkg/aws/metrics/collector_test.go b/pkg/aws/metrics/collector_test.go new file mode 100644 index 00000000..921a18ba --- /dev/null +++ b/pkg/aws/metrics/collector_test.go @@ -0,0 +1,125 @@ +package metrics + +import ( + "errors" + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/stretchr/testify/assert" + "net/http" + "testing" +) + +func Test_statusCodeForRequest(t *testing.T) { + type args struct { + r *request.Request + } + tests := []struct { + name string + args args + want string + }{ + { + name: "requests without http response", + args: args{ + r: &request.Request{}, + }, + want: "0", + }, + { + name: "requests with http response", + args: args{ + r: &request.Request{ + HTTPResponse: &http.Response{ + StatusCode: 200, + }, + }, + }, + want: "200", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := statusCodeForRequest(tt.args.r) + assert.Equal(t, tt.want, got) + }) + } +} + +func Test_errorCodeForRequest(t *testing.T) { + type args struct { + r *request.Request + } + tests := []struct { + name string + args args + want string + }{ + { + name: "requests without error", + args: args{ + r: &request.Request{}, + }, + want: "", + }, + { + name: "requests with internal error", + args: args{ + r: &request.Request{ + Error: errors.New("oops, some internal error"), + }, + }, + want: "internal", + }, + { + name: "requests with aws error", + args: args{ + r: &request.Request{ + Error: awserr.New("NotFoundException", "", nil), + }, + }, + want: "NotFoundException", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := errorCodeForRequest(tt.args.r) + assert.Equal(t, tt.want, got) + }) + } +} + +func Test_operationForRequest(t *testing.T) { + type args struct { + r *request.Request + } + tests := []struct { + name string + args args + want string + }{ + { + name: "requests without operation", + args: args{ + r: &request.Request{}, + }, + want: "?", + }, + { + name: "requests with operation", + args: args{ + r: &request.Request{ + Operation: &request.Operation{ + Name: "DescribeMesh", + }, + }, + }, + want: "DescribeMesh", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := operationForRequest(tt.args.r) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/pkg/aws/metrics/instruments.go b/pkg/aws/metrics/instruments.go new file mode 100644 index 00000000..e3ca8120 --- /dev/null +++ b/pkg/aws/metrics/instruments.go @@ -0,0 +1,85 @@ +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +const ( + metricSubsystemAWS = "aws" + + metricAPICallsTotal = "api_calls_total" + metricAPICallDurationSeconds = "api_call_duration_seconds" + metricAPICallRetries = "api_call_retries" + + metricAPIRequestsTotal = "api_requests_total" + metricAPIRequestDurationSeconds = "api_request_duration_seconds" +) + +const ( + labelService = "service" + labelOperation = "operation" + labelStatusCode = "status_code" + labelErrorCode = "error_code" +) + +type instruments struct { + apiCallsTotal *prometheus.CounterVec + apiCallDurationSeconds *prometheus.HistogramVec + apiCallRetries *prometheus.HistogramVec + apiRequestsTotal *prometheus.CounterVec + apiRequestDurationSecond *prometheus.HistogramVec +} + +// newInstruments allocates and register new metrics to registerer +func newInstruments(registerer prometheus.Registerer) (*instruments, error) { + apiCallsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: metricSubsystemAWS, + Name: metricAPICallsTotal, + Help: "Total number of SDK API calls from the customer's code to AWS services", + }, []string{labelService, labelOperation, labelStatusCode, labelErrorCode}) + apiCallDurationSeconds := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: metricSubsystemAWS, + Name: metricAPICallDurationSeconds, + Help: "Perceived latency from when your code makes an SDK call, includes retries", + }, []string{labelService, labelOperation}) + apiCallRetries := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: metricSubsystemAWS, + Name: metricAPICallRetries, + Help: "Number of times the SDK retried requests to AWS services for SDK API calls", + Buckets: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + }, []string{labelService, labelOperation}) + + apiRequestsTotal := prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: metricSubsystemAWS, + Name: metricAPIRequestsTotal, + Help: "Total number of HTTP requests that the SDK made", + }, []string{labelService, labelOperation, labelStatusCode, labelErrorCode}) + apiRequestDurationSecond := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: metricSubsystemAWS, + Name: metricAPIRequestDurationSeconds, + Help: "Latency of an individual HTTP request to the service endpoint", + }, []string{labelService, labelOperation}) + + if err := registerer.Register(apiCallsTotal); err != nil { + return nil, err + } + if err := registerer.Register(apiCallDurationSeconds); err != nil { + return nil, err + } + if err := registerer.Register(apiCallRetries); err != nil { + return nil, err + } + if err := registerer.Register(apiRequestsTotal); err != nil { + return nil, err + } + if err := registerer.Register(apiRequestDurationSecond); err != nil { + return nil, err + } + return &instruments{ + apiCallsTotal: apiCallsTotal, + apiCallDurationSeconds: apiCallDurationSeconds, + apiCallRetries: apiCallRetries, + apiRequestsTotal: apiRequestsTotal, + apiRequestDurationSecond: apiRequestDurationSecond, + }, nil +}