Skip to content

Commit

Permalink
Support getting container service metrics, move metrics to a metrics …
Browse files Browse the repository at this point in the history
…endpoint, consolidate query processing (#11)

* Support getting container service metrics, move metrics to a metrics endpoint, consolidate query processing
  • Loading branch information
fishnix authored Apr 16, 2020
1 parent 2ef3a31 commit c6e352a
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 155 deletions.
34 changes: 14 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ GET /v1/cost/{account}/spaces/{spaceid}[?start=2019-10-01&end=2019-10-30]
GET /v1/cost/{account}/instances/{id}/metrics/graph.png?metric={metric1}[&metric={metric2}&start=-P1D&end=PT0H&period=300]
GET /v1/cost/{account}/instances/{id}/metrics/graph?metric={metric1}[&metric={metric2}&start=-P1D&end=PT0H&period=300]
GET /v1/metrics/{account}/instances/{id}/graph?metric={metric1}[&metric={metric2}&start=-P1D&end=PT0H&period=300]
GET /v1/metrics/{account}/clusters/{cluster}/services/{service}/graph?metric={metric1}[&metric={metric2}&start=-P1D&end=PT0H&period=300]
```

## Usage
Expand Down Expand Up @@ -56,34 +59,21 @@ GET /v1/cost/{account}/spaces/{spaceid}
]
```

### Get cloudwatch metrics widgets for an instance ID
### Get cloudwatch metrics widgets URL from S3 for an instance ID

This will get the passed metric(s) for the passed instance ID in a `image/png` graph for the past 1 day by default. It's also
This will get the passed metric(s) for the passed instance ID or container cluster/service as an `image/png` graph for the past 1 day by default, cache it in S3
and return the URL. URLs are cached in the API for 5 minutes; the images should be purged from the S3 cache on a schedule. It's also
possible to pass the start time, end time and period (e.g. `300s` for 300 seconds, `5m` for 5 minutes). Query parameters must follow
the [CloudWatch Metric Widget Structure](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Metric-Widget-Structure.html).

#### Request

```
GET /v1/cost/{account}/instances/{id}/metrics/graph.png?metric={metric1}[&metric={metric2}&....]
GET /v1/cost/{account}/instances/{id}/metrics/graph.png?metric={metric1}[&metric={metric2}&start={start}&end={end}&period={period}]
```

#### Response

![WidgetExample](/img/example_response.png?raw=true)

### Get cloudwatch metrics widgets URL from S3 for an instance ID

This will get the passed metric(s) for the passed instance ID as an `image/png` graph for the past 1 day by default, cache it in S3
and return the URL. URLs are cached in the API for 5 minutes; the images should be purged from the S3 cache on a schedule. It's also
possible to pass the start time, end time and period (e.g. `300s` for 300 seconds, `5m` for 5 minutes). Query parameters must follow the [CloudWatch Metric Widget Structure](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Metric-Widget-Structure.html).

#### Request
GET /v1/metrics/{account}/instances/{id}/graph?metric={metric1}[&metric={metric2}&....]
GET /v1/metrics/{account}/instances/{id}/graph?metric={metric1}[&metric={metric2}&start={start}&end={end}&period={period}]
```
GET /v1/cost/{account}/instances/{id}/metrics/graph?metric={metric1}[&metric={metric2}&....]
GET /v1/cost/{account}/instances/{id}/metrics/graph?metric={metric1}[&metric={metric2}&start={start}&end={end}&period={period}]
GET /v1/metrics/{account}/clusters/{cluster}/services/{service}/graph?metric={metric1}[&metric={metric2}&....]
GET /v1/metrics/{account}/clusters/{cluster}/services/{service}/graph?metric={metric1}[&metric={metric2}&start={start}&end={end}&period={period}]
```

#### Response
Expand All @@ -94,6 +84,10 @@ GET /v1/cost/{account}/instances/{id}/metrics/graph?metric={metric1}[&metric={me
}
```

with an image like this

![WidgetExample](/img/example_response.png?raw=true)

## Image Caching

When image urls are returned for metrics graph data, they are cached in the image cache. The default implementation of this cache is an S3 bucket where the URLs are returned in the response (and cached in the data cache).
Expand Down
180 changes: 109 additions & 71 deletions api/handlers_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,38 @@ package api
import (
"fmt"
"net/http"
"sort"
"strings"
"time"

"github.com/YaleSpinup/cost-api/apierror"
"github.com/YaleSpinup/cost-api/cloudwatch"
"github.com/gorilla/mux"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
)

// MetricsGetImageHandler gets metrics from cloudwatch
func (s *server) MetricsGetImageHandler(w http.ResponseWriter, r *http.Request) {
// GetEC2MetricsURLHandler gets metrics from cloudwatch and returns a link to the image
func (s *server) GetEC2MetricsURLHandler(w http.ResponseWriter, r *http.Request) {
w = LogWriter{w}
vars := mux.Vars(r)
account := vars["account"]
id := vars["id"]
instanceId := vars["id"]

cwService, ok := s.cloudwatchServices[account]
if !ok {
msg := fmt.Sprintf("cloudwatch service not found for account: %s", account)
handleError(w, apierror.New(apierror.ErrNotFound, msg, nil))
return
}
log.Debugf("found cloudwatch service %+v", cwService)

resultCache, ok := s.resultCache[account]
if !ok {
msg := fmt.Sprintf("result cache not found for account: %s", account)
handleError(w, apierror.New(apierror.ErrNotFound, msg, nil))
return
}
log.Debugf("found cost explorer result cache %+v", *resultCache)

queries := r.URL.Query()
metrics := queries["metric"]
Expand All @@ -27,90 +43,61 @@ func (s *server) MetricsGetImageHandler(w http.ResponseWriter, r *http.Request)
return
}

period := int64(300)
if p, ok := vars["period"]; ok && p != "" {
dur, err := time.ParseDuration(p)
if err != nil {
msg := fmt.Sprintf("failed to parse period as duration: %s", err)
handleError(w, apierror.New(apierror.ErrNotFound, msg, nil))
return
}

period = int64(dur.Seconds())
}

start := "-P1D"
if s, ok := vars["start"]; ok {
start = s
req := cloudwatch.MetricsRequest{}
if err := parseQuery(r, req); err != nil {
handleError(w, apierror.New(apierror.ErrBadRequest, "failed to parse query", err))
return
}

end := "PT0H"
if e, ok := vars["end"]; ok {
end = e
}
key := fmt.Sprintf("%s/%s/%s/%v/%v/%v", Org, instanceId, strings.Join(metrics, "-"), req["start"], req["end"], req["period"])
hashedCacheKey := s.imageCache.HashedKey(key)
if res, expire, ok := resultCache.GetWithExpiration(hashedCacheKey); ok {
log.Debugf("found cached object: %s", res)

cwService, ok := s.cloudwatchServices[account]
if !ok {
msg := fmt.Sprintf("cloudwatch service not found for account: %s", account)
handleError(w, apierror.New(apierror.ErrNotFound, msg, nil))
return
if body, ok := res.([]byte); ok {
w.Header().Set("X-Cache-Hit", "true")
w.Header().Set("X-Cache-Expire", fmt.Sprintf("%0.fs", time.Until(expire).Seconds()))
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write(body)
return
}
}
log.Debugf("found cloudwatch service %+v", cwService)

cwMetrics := []cloudwatch.Metric{}
for _, m := range metrics {
cwMetrics = append(cwMetrics, cloudwatch.Metric{"AWS/EC2", m, "InstanceId", id})
cwMetrics = append(cwMetrics, cloudwatch.Metric{"AWS/EC2", m, "InstanceId", instanceId})
}
req["metrics"] = cwMetrics

log.Debugf("getting metrics %+v, start: %s, end: %s with period: %ds", cwMetrics, start, end, period)
out, err := cwService.GetMetricWidget(r.Context(), cwMetrics, period, start, end)
log.Debugf("getting metrics with request %+v", req)
image, err := cwService.GetMetricWidget(r.Context(), req)
if err != nil {
log.Errorf("failed getting metrics widget image: %s", err)
handleError(w, err)
return
}

w.Header().Set("Content-Type", "image/png")
meta, err := s.imageCache.Save(r.Context(), hashedCacheKey, image)
if err != nil {
log.Errorf("failed saving metrics widget image to cache: %s", err)
handleError(w, err)
return
}
resultCache.Set(hashedCacheKey, meta, 300*time.Second)

w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
w.Write(out)
w.Write(meta)
}

// MetricsGetURLHandler gets metrics from cloudwatch and returns a link to the image
func (s *server) MetricsGetImageUrlHandler(w http.ResponseWriter, r *http.Request) {
// GetECSMetricsURLHandler gets metrics from cloudwatch and returns a link to the image
func (s *server) GetECSMetricsURLHandler(w http.ResponseWriter, r *http.Request) {
w = LogWriter{w}
vars := mux.Vars(r)
account := vars["account"]
id := vars["id"]

queries := r.URL.Query()
metrics := queries["metric"]
if len(metrics) == 0 {
handleError(w, apierror.New(apierror.ErrBadRequest, "at least one metric is required", nil))
return
}
sort.Strings(metrics)

period := int64(300)
if p, ok := vars["period"]; ok && p != "" {
dur, err := time.ParseDuration(p)
if err != nil {
msg := fmt.Sprintf("failed to parse period as duration: %s", err)
handleError(w, apierror.New(apierror.ErrNotFound, msg, nil))
return
}

period = int64(dur.Seconds())
}

start := "-P1D"
if s, ok := vars["start"]; ok && s != "" {
start = s
}

end := "PT0H"
if e, ok := vars["end"]; ok && e != "" {
end = e
}
cluster := vars["cluster"]
service := vars["service"]

cwService, ok := s.cloudwatchServices[account]
if !ok {
Expand All @@ -128,7 +115,20 @@ func (s *server) MetricsGetImageUrlHandler(w http.ResponseWriter, r *http.Reques
}
log.Debugf("found cost explorer result cache %+v", *resultCache)

key := fmt.Sprintf("%s/%s/%s/%s/%s/%d", Org, id, strings.Join(metrics, "-"), start, end, period)
queries := r.URL.Query()
metrics := queries["metric"]
if len(metrics) == 0 {
handleError(w, apierror.New(apierror.ErrBadRequest, "at least one metric is required", nil))
return
}

req := cloudwatch.MetricsRequest{}
if err := parseQuery(r, req); err != nil {
handleError(w, apierror.New(apierror.ErrBadRequest, "failed to parse query", err))
return
}

key := fmt.Sprintf("%s/%s/%s/%v/%v/%v", Org, fmt.Sprintf("%s-%s", cluster, service), strings.Join(metrics, "-"), req["start"], req["end"], req["period"])
hashedCacheKey := s.imageCache.HashedKey(key)
if res, expire, ok := resultCache.GetWithExpiration(hashedCacheKey); ok {
log.Debugf("found cached object: %s", res)
Expand All @@ -145,11 +145,12 @@ func (s *server) MetricsGetImageUrlHandler(w http.ResponseWriter, r *http.Reques

cwMetrics := []cloudwatch.Metric{}
for _, m := range metrics {
cwMetrics = append(cwMetrics, cloudwatch.Metric{"AWS/EC2", m, "InstanceId", id})
cwMetrics = append(cwMetrics, cloudwatch.Metric{"AWS/ECS", m, "ClusterName", cluster, "ServiceName", service})
}
req["metrics"] = cwMetrics

log.Debugf("getting metrics %+v, start: %s, end: %s with period: %ds", cwMetrics, start, end, period)
image, err := cwService.GetMetricWidget(r.Context(), cwMetrics, period, start, end)
log.Debugf("getting metrics with request %+v", req)
image, err := cwService.GetMetricWidget(r.Context(), req)
if err != nil {
log.Errorf("failed getting metrics widget image: %s", err)
handleError(w, err)
Expand All @@ -168,3 +169,40 @@ func (s *server) MetricsGetImageUrlHandler(w http.ResponseWriter, r *http.Reques
w.WriteHeader(http.StatusOK)
w.Write(meta)
}

// parseQuery reads the optional graph parameters from the request's query
// string and stores them in request under the keys "stat", "period", "start"
// and "end".
//
// A parameter that is absent or empty falls back to its default:
//
//	stat:   "Average"
//	period: 300 (seconds)
//	start:  "-P1D"
//	end:    "PT0H"
//
// period is parsed with time.ParseDuration (e.g. "300s", "5m") and stored as
// an int64 count of whole seconds. An error is returned when the period is
// not a valid duration or is shorter than one second. When a parameter is
// repeated, only its first value is used.
func parseQuery(r *http.Request, request cloudwatch.MetricsRequest) error {
	queries := r.URL.Query()
	log.Debugf("parsing queries: %+v", queries)

	// Values.Get returns the first value for the key, or "" when the key is
	// missing, so one emptiness check covers both "absent" and "present but
	// blank" (e.g. "?start=") and keeps the default in either case.
	stat := "Average"
	if s := queries.Get("stat"); s != "" {
		stat = s
	}
	request["stat"] = stat

	period := int64(300)
	if p := queries.Get("period"); p != "" {
		dur, err := time.ParseDuration(p)
		if err != nil {
			return errors.Wrap(err, "failed to parse period as duration")
		}

		// Truncating to whole seconds would silently turn a sub-second or
		// negative duration into a zero/negative period; reject it instead.
		if dur < time.Second {
			return errors.Errorf("period must be at least one second, got %q", p)
		}

		period = int64(dur.Seconds())
	}
	request["period"] = period

	start := "-P1D"
	if s := queries.Get("start"); s != "" {
		start = s
	}
	request["start"] = start

	end := "PT0H"
	if e := queries.Get("end"); e != "" {
		end = e
	}
	request["end"] = end

	return nil
}
Loading

0 comments on commit c6e352a

Please sign in to comment.