【CloudFormation】EKSの全てのNode、Podを自動でCloudWatchダッシュボードに追加する

本記事では、CloudFormationによる、EKS のメトリクスをCloudWatchダッシュボードに追加する方法について、コード付きで解説します。

※下記写真は例です。

S3 Bucket Policy

前提

EKS、Podは事前に作成する。

EKSの名前は下記。

・EKS:test-cluster

説明

検索式を使用することで、検索式に一致するメトリクスを自動的に追加するグラフを作成可能。

例えば、以下のような検索式を使用してグラフを表示することで、ClusterName=“test-cluster"のディメンションを持つnode_filesystem_utilizationメトリクスを表示させることが可能。

test-clusterにて新規にEC2インスタンスが起動した場合も、グラフも追従させることが可能。

SEARCH('{ContainerInsights,ClusterName} ClusterName="test-cluster" MetricName="node_filesystem_utilization"', 'Average', 60)

各サービスのmetrics、名前空間(namespace)は [2] のようなドキュメントから確認することが出来る。

[1] グラフで検索式を使用する

[2] Amazon EKS および Kubernetes Container Insights のメトリクス

実際のコード

AWSTemplateFormatVersion: '2010-09-09'
Description: CloudWatch dashboard that charts Container Insights metrics for one EKS cluster.

#####################################################################
#
# Parameters
#
#####################################################################
Parameters:
  # Name of the EKS cluster whose Container Insights metrics are charted.
  # The value is substituted verbatim into the dashboard's JSON body and
  # into its SEARCH expressions, so it is restricted to the character set
  # EKS accepts for cluster names (alphanumerics, hyphens, underscores,
  # starting with an alphanumeric, at most 100 characters). A quote or
  # backslash in the value would otherwise corrupt the generated JSON.
  EKSClusterName:
    Description: EKS Cluster Name
    Type: String
    Default: test-cluster
    MinLength: 1
    MaxLength: 100
    AllowedPattern: '^[0-9A-Za-z][A-Za-z0-9_-]*$'
    ConstraintDescription: >-
      Must be 1-100 characters, start with an alphanumeric character, and
      contain only alphanumeric characters, hyphens, and underscores.

#####################################################################
#
# Dashboard
#
#####################################################################
Resources:
  # CloudWatch dashboard for the EKS cluster named by EKSClusterName.
  # The dashboard takes the name of the stack that creates it.
  CWDashboard:
    Type: AWS::CloudWatch::Dashboard
    Properties:
      DashboardName: !Ref AWS::StackName
      # DashboardBody is a JSON document; !Sub fills in ${EKSClusterName}
      # and ${AWS::Region} before the body is sent to CloudWatch. Comments
      # cannot be placed inside the block scalar below because every line
      # of it is part of the JSON string.
      #
      # Layout: two columns of 12x6 widgets, six rows (y = 0, 6, ..., 30).
      #   row 0: cluster_failed_node_count     | cluster_node_count
      #   row 1: node_cpu_utilization          | node_filesystem_utilization
      #   row 2: node_memory_utilization       | node_number_of_running_containers
      #   row 3: node_number_of_running_pods   | pod_cpu_utilization
      #   row 4: pod_cpu_utilization_over_pod_limit    | pod_memory_utilization
      #   row 5: pod_memory_utilization_over_pod_limit | pod_number_of_container_restarts
      #
      # The two cluster_* widgets reference a single metric directly. All
      # other widgets use a SEARCH expression so that every node or pod
      # metric matching the cluster name is charted without listing
      # nodes/pods explicitly.
      #
      # "region" uses the AWS::Region pseudo parameter so the widgets read
      # metrics from the region the stack is deployed in.
      DashboardBody: !Sub |
        {
          "widgets": [
            {
              "type": "metric",
              "x": 0,
              "y": 0,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [ "ContainerInsights", "cluster_failed_node_count", "ClusterName", "${EKSClusterName}" ]
                ],
                "region": "${AWS::Region}",
                "title": "EKS cluster_failed_node_count"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 0,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [ "ContainerInsights", "cluster_node_count", "ClusterName", "${EKSClusterName}" ]
                ],
                "region": "${AWS::Region}",
                "title": "EKS cluster_node_count"
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 6,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,ClusterName,NodeName,InstanceId} ClusterName=\"${EKSClusterName}\" MetricName=\"node_cpu_utilization\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS node_cpu_utilization"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 6,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,ClusterName,NodeName,InstanceId} ClusterName=\"${EKSClusterName}\" MetricName=\"node_filesystem_utilization\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS node_filesystem_utilization"
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 12,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,ClusterName,NodeName,InstanceId} ClusterName=\"${EKSClusterName}\" MetricName=\"node_memory_utilization\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS node_memory_utilization"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 12,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,ClusterName,NodeName,InstanceId} ClusterName=\"${EKSClusterName}\" MetricName=\"node_number_of_running_containers\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS node_number_of_running_containers"
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 18,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,ClusterName,NodeName,InstanceId} ClusterName=\"${EKSClusterName}\" MetricName=\"node_number_of_running_pods\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS node_number_of_running_pods"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 18,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,PodName,Namespace,ClusterName} ClusterName=\"${EKSClusterName}\" MetricName=\"pod_cpu_utilization\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS pod_cpu_utilization"
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 24,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,PodName,Namespace,ClusterName} ClusterName=\"${EKSClusterName}\" MetricName=\"pod_cpu_utilization_over_pod_limit\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS pod_cpu_utilization_over_pod_limit"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 24,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,PodName,Namespace,ClusterName} ClusterName=\"${EKSClusterName}\" MetricName=\"pod_memory_utilization\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS pod_memory_utilization"
              }
            },
            {
              "type": "metric",
              "x": 0,
              "y": 30,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,PodName,Namespace,ClusterName} ClusterName=\"${EKSClusterName}\" MetricName=\"pod_memory_utilization_over_pod_limit\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS pod_memory_utilization_over_pod_limit"
              }
            },
            {
              "type": "metric",
              "x": 12,
              "y": 30,
              "width": 12,
              "height": 6,
              "properties": {
                "view": "timeSeries",
                "stacked": false,
                "metrics": [
                  [{"expression": "SEARCH('{ContainerInsights,PodName,Namespace,ClusterName} ClusterName=\"${EKSClusterName}\" MetricName=\"pod_number_of_container_restarts\"', 'Average', 60)"}]
                ],
                "region": "${AWS::Region}",
                "title": "EKS pod_number_of_container_restarts"
              }
            }
          ]
        }