Skip to content

Commit

Permalink
Add Elastic Fabric Adapter (EFA) metric collection to awscontainerins…
Browse files Browse the repository at this point in the history
…ightreceiver. (open-telemetry#186)

* Add Elastic Fabric Adapter (EFA) metric collection to awscontainerinsightreceiver.

The new component scrapes hardware counters from /sys/class/infiniband on disk.  The layout of that directory is:

/sys/class/infiniband/<device name>
└── ports
    └── 1
        └── hw_counters
            ├── rdma_read_bytes
            ├── rdma_write_bytes
            ├── rdma_write_recv_bytes
            ├── rx_bytes
            ├── rx_drops
            └── tx_bytes

These are cumulative counters and so they are converted to deltas before sending down the pipeline.
We sum up data from all ports.

The device data is joined with data from the kubelet podresources API which tells us which container a given device is
assigned to.

The metrics are reported at container, pod, and node levels.

This commit also refactors some metric decoration code from cadvisor to a common localnode decorator, intended to be
used by any awscontainerinsightreceiver component that gathers metrics from the local node (as oppoosed to e.g. the k8s
control plane scraper).  This is because we want to share the logic of populating PodName, Kubernetes labels, etc.

I also renamed RawContainerInsightsMetric to CIMetricImpl for brevity.
  • Loading branch information
straussb committed Mar 28, 2024
1 parent caa43df commit eed8454
Show file tree
Hide file tree
Showing 51 changed files with 1,638 additions and 359 deletions.
28 changes: 23 additions & 5 deletions internal/aws/containerinsight/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ const (
MinTimeDiff = 50 * time.Microsecond

// Environment variables
HostName = "HOST_NAME"
RunInContainer = "RUN_IN_CONTAINER"
RunAsHostProcessContainer = "RUN_AS_HOST_PROCESS_CONTAINER"
TrueValue = "True"
Expand Down Expand Up @@ -133,6 +134,13 @@ const (
DiskIOWrite = "Write"
DiskIOTotal = "Total"

EfaRdmaReadBytes = "rdma_read_bytes"
EfaRdmaWriteBytes = "rdma_write_bytes"
EfaRdmaWriteRecvBytes = "rdma_write_recv_bytes"
EfaRxBytes = "rx_bytes"
EfaRxDropped = "rx_dropped"
EfaTxBytes = "tx_bytes"

// Define the metric types
TypeCluster = "Cluster"
TypeClusterService = "ClusterService"
Expand All @@ -158,11 +166,14 @@ const (
// Special type for pause container
// because containerd does not set container name pause container name to POD like docker does.
TypeInfraContainer = "InfraContainer"
TypeGpuContainer = "ContainerGPU"
TypeGpuPod = "PodGPU"
TypeGpuNode = "NodeGPU"
TypeGpuCluster = "ClusterGPU"
TypeNeuronContainer = "ContainerNeuron"
TypeContainerGPU = "ContainerGPU"
TypePodGPU = "PodGPU"
TypeNodeGPU = "NodeGPU"
TypeClusterGPU = "ClusterGPU"
TypeContainerNeuron = "ContainerNeuron"
TypeContainerEFA = "ContainerEFA"
TypePodEFA = "PodEFA"
TypeNodeEFA = "NodeEFA"

// unit
UnitBytes = "Bytes"
Expand Down Expand Up @@ -301,5 +312,12 @@ func init() {
ContainerCount: UnitCount,
ContainerRestartCount: UnitCount,
RunningTaskCount: UnitCount,

EfaRdmaReadBytes: UnitBytesPerSec,
EfaRdmaWriteBytes: UnitBytesPerSec,
EfaRdmaWriteRecvBytes: UnitBytesPerSec,
EfaRxBytes: UnitBytesPerSec,
EfaRxDropped: UnitCountPerSec,
EfaTxBytes: UnitBytesPerSec,
}
}
1 change: 1 addition & 0 deletions internal/aws/containerinsight/k8sconst.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const (
AttributeContainerName = "ContainerName"
AttributeContainerID = "ContainerId"
AttributeGpuDevice = "GpuDevice"
AttributeEfaDevice = "EfaDevice"

PodStatus = "pod_status"
ContainerStatus = "container_status"
Expand Down
30 changes: 27 additions & 3 deletions internal/aws/containerinsight/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,13 @@ func SumFields(fields []map[string]any) map[string]float64 {
// IsNode checks if a type belongs to node level metrics (for EKS)
func IsNode(mType string) bool {
switch mType {
case TypeNode, TypeNodeNet, TypeNodeFS, TypeNodeDiskIO:
case
TypeNode,
TypeNodeDiskIO,
TypeNodeEFA,
TypeNodeFS,
TypeNodeGPU,
TypeNodeNet:
return true
}
return false
Expand All @@ -70,7 +76,12 @@ func IsInstance(mType string) bool {
// IsContainer checks if a type belongs to container level metrics
func IsContainer(mType string) bool {
switch mType {
case TypeContainer, TypeContainerDiskIO, TypeContainerFS:
case
TypeContainer,
TypeContainerDiskIO,
TypeContainerEFA,
TypeContainerFS,
TypeContainerGPU:
return true
}
return false
Expand All @@ -79,7 +90,11 @@ func IsContainer(mType string) bool {
// IsPod checks if a type belongs to container level metrics
func IsPod(mType string) bool {
switch mType {
case TypePod, TypePodNet:
case
TypePod,
TypePodEFA,
TypePodGPU,
TypePodNet:
return true
}
return false
Expand All @@ -100,9 +115,12 @@ func getPrefixByMetricType(mType string) string {
nodePrefix := "node_"
instanceNetPrefix := "instance_interface_"
nodeNetPrefix := "node_interface_"
nodeEfaPrefix := "node_efa_"
podPrefix := "pod_"
podNetPrefix := "pod_interface_"
podEfaPrefix := "pod_efa_"
containerPrefix := "container_"
containerEfaPrefix := "container_efa_"
service := "service_"
cluster := "cluster_"
namespace := "namespace_"
Expand All @@ -128,16 +146,22 @@ func getPrefixByMetricType(mType string) string {
prefix = nodePrefix
case TypeNodeNet:
prefix = nodeNetPrefix
case TypeNodeEFA:
prefix = nodeEfaPrefix
case TypePod:
prefix = podPrefix
case TypePodNet:
prefix = podNetPrefix
case TypePodEFA:
prefix = podEfaPrefix
case TypeContainer:
prefix = containerPrefix
case TypeContainerDiskIO:
prefix = containerPrefix
case TypeContainerFS:
prefix = containerPrefix
case TypeContainerEFA:
prefix = containerEfaPrefix
case TypeService:
prefix = service
case TypeCluster:
Expand Down
50 changes: 48 additions & 2 deletions internal/aws/containerinsight/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,11 @@ func TestMetricName(t *testing.T) {

func TestIsNode(t *testing.T) {
assert.Equal(t, true, IsNode(TypeNode))
assert.Equal(t, true, IsNode(TypeNodeNet))
assert.Equal(t, true, IsNode(TypeNodeFS))
assert.Equal(t, true, IsNode(TypeNodeDiskIO))
assert.Equal(t, true, IsNode(TypeNodeEFA))
assert.Equal(t, true, IsNode(TypeNodeFS))
assert.Equal(t, true, IsNode(TypeNodeGPU))
assert.Equal(t, true, IsNode(TypeNodeNet))
assert.Equal(t, false, IsNode(TypePod))
}

Expand All @@ -90,12 +92,16 @@ func TestIsInstance(t *testing.T) {
func TestIsContainer(t *testing.T) {
assert.Equal(t, true, IsContainer(TypeContainer))
assert.Equal(t, true, IsContainer(TypeContainerDiskIO))
assert.Equal(t, true, IsContainer(TypeContainerEFA))
assert.Equal(t, true, IsContainer(TypeContainerGPU))
assert.Equal(t, true, IsContainer(TypeContainerFS))
assert.Equal(t, false, IsContainer(TypePod))
}

func TestIsPod(t *testing.T) {
assert.Equal(t, true, IsPod(TypePod))
assert.Equal(t, true, IsPod(TypePodEFA))
assert.Equal(t, true, IsPod(TypePodGPU))
assert.Equal(t, true, IsPod(TypePodNet))
assert.Equal(t, false, IsPod(TypeInstance))
}
Expand Down Expand Up @@ -866,3 +872,43 @@ func TestConvertToOTLPMetricsForPodContainerStatusMetrics(t *testing.T) {
md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
}

func TestConvertToOTLPMetricsForPodEfaMetrics(t *testing.T) {
var fields map[string]any
var expectedUnits map[string]string
var tags map[string]string
var md pmetric.Metrics
now := time.Now()
timestamp := strconv.FormatInt(now.UnixNano(), 10)

fields = map[string]any{
"pod_efa_rdma_read_bytes": uint64(31654),
"pod_efa_rdma_write_bytes": uint64(45548),
"pod_efa_rdma_write_recv_bytes": uint64(63253),
"pod_efa_rx_bytes": uint64(33543),
"pod_efa_rx_dropped": uint64(40089),
"pod_efa_tx_bytes": uint64(42508),
}
expectedUnits = map[string]string{
"pod_efa_rdma_read_bytes": UnitBytesPerSec,
"pod_efa_rdma_write_bytes": UnitBytesPerSec,
"pod_efa_rdma_write_recv_bytes": UnitBytesPerSec,
"pod_efa_rx_bytes": UnitBytesPerSec,
"pod_efa_rx_dropped": UnitCountPerSec,
"pod_efa_tx_bytes": UnitBytesPerSec,
}
tags = map[string]string{
"ClusterName": "eks-aoc",
"InstanceId": "i-01bf9fb097cbf3205",
"InstanceType": "t2.xlarge",
"Namespace": "amazon-cloudwatch",
"NodeName": "ip-192-168-12-170.ec2.internal",
"PodName": "cloudwatch-agent",
"ContainerName": "cloudwatch-agent",
"Type": "PodEFA",
"Version": "0",
"Timestamp": timestamp,
}
md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
}
Loading

0 comments on commit eed8454

Please sign in to comment.