Adding system metrics support #97

Closed
wants to merge 3 commits into from
5 changes: 1 addition & 4 deletions Dockerfile
@@ -19,10 +19,7 @@ RUN go mod download
# Copy the go source
COPY cmd/main.go cmd/main.go
COPY api/ api/
COPY pkg/controllers/ pkg/controllers/
COPY pkg/cert/ pkg/cert/
COPY pkg/webhooks/ pkg/webhooks/
COPY pkg/utils pkg/utils
COPY pkg/ pkg/

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
3 changes: 3 additions & 0 deletions cmd/main.go
@@ -35,6 +35,7 @@ import (
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/cert"
"sigs.k8s.io/lws/pkg/controllers"
"sigs.k8s.io/lws/pkg/metrics"
"sigs.k8s.io/lws/pkg/webhooks"
//+kubebuilder:scaffold:imports
)
@@ -76,6 +77,8 @@ func main() {
kubeConfig.QPS = float32(qps)
kubeConfig.Burst = burst

metrics.Register()

mgr, err := ctrl.NewManager(kubeConfig, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: metricsAddr},
2 changes: 1 addition & 1 deletion go.mod
@@ -7,6 +7,7 @@ require (
github.com/onsi/ginkgo/v2 v2.17.1
github.com/onsi/gomega v1.32.0
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.18.0
k8s.io/api v0.29.3
k8s.io/apiextensions-apiserver v0.29.3
k8s.io/apimachinery v0.29.3
@@ -48,7 +49,6 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.18.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
17 changes: 15 additions & 2 deletions pkg/controllers/leaderworkerset_controller.go
@@ -41,6 +41,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/metrics"
"sigs.k8s.io/lws/pkg/utils"
podutils "sigs.k8s.io/lws/pkg/utils/pod"
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
@@ -348,7 +349,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l
readyCount := 0
updatedCount := 0
templateHash := utils.LeaderWorkerTemplateHash(lws)

var readyLeaderPods []corev1.Pod
// Iterate through all statefulsets.
for _, sts := range lwssts.Items {
if sts.Name == lws.Name {
@@ -367,7 +368,7 @@ }
}
if podutils.PodRunningAndReady(leaderPod) {
readyCount++

readyLeaderPods = append(readyLeaderPods, leaderPod)
if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash {
updatedCount++
}
@@ -389,6 +390,9 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l
updateCondition := setCondition(lws, condition)
// if condition changed, record events
if updateCondition {
if updatedCount == int(*lws.Spec.Replicas) {
metrics.ReplicaReadyStatus(readyLeaderPods, getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws))
Collaborator:

the start time here is when the lws has just started progressing; there is a possibility that the leader pod is not scheduled yet.

Contributor Author:

Asked a similar question in another comment: what is the replica latency actually measuring? I thought it was from lws creation to when the replica is ready.

}
r.Record.Eventf(lws, corev1.EventTypeNormal, condition.Reason, condition.Message+fmt.Sprintf(", with %d groups ready of total %d groups", readyCount, int(*lws.Spec.Replicas)))
}
return updateStatus || updateCondition, nil
@@ -569,3 +573,12 @@ func templateUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerS
func replicasUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerSet) bool {
return *sts.Spec.Replicas != *lws.Spec.Replicas
}

func getLastTransitionTime(conditionType string, lws *leaderworkerset.LeaderWorkerSet) metav1.Time {
Collaborator:

maybe move this to a util file?

Contributor Author:

Was thinking about that as well; I can create an lws utils file similar to the one that exists for pods, and move this function, templateUpdated, and replicasUpdated to it in a different PR.

for _, condition := range lws.Status.Conditions {
if condition.Type == conditionType {
return condition.LastTransitionTime
}
}
return metav1.Now()
}
2 changes: 2 additions & 0 deletions pkg/controllers/pod_controller.go
@@ -39,6 +39,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/metrics"
acceleratorutils "sigs.k8s.io/lws/pkg/utils/accelerators"
podutils "sigs.k8s.io/lws/pkg/utils/pod"
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
@@ -174,6 +175,7 @@ func (r *PodReconciler) handleRestartPolicy(ctx context.Context, pod corev1.Pod,
}); err != nil {
return false, err
}
metrics.RecreatingGroup(leader.Name)
return true, nil
}

78 changes: 78 additions & 0 deletions pkg/metrics/metrics.go
@@ -0,0 +1,78 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
podutils "sigs.k8s.io/lws/pkg/utils/pod"

"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
rollingUpdateDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "lws",
Name: "rolling_update_duration",
Help: "Duration of rolling updates",
}, []string{"hash"},
)

recreateGroupTimes = prometheus.NewCounterVec(
Collaborator:

this is not called anywhere else, right?

Contributor Author:

Yeah, adding it to register it, but waiting on #106 to implement it.

prometheus.CounterOpts{
Subsystem: "lws",
Name: "recreate_group_times",
Help: "number of times a group has been recreated",
}, []string{"leadername"},
)

replicaReadyStatusDuration = prometheus.NewHistogramVec(
Collaborator:

add a comment here about what the metric records, like "the latency for a pod group to be ready after creation"

Collaborator:

but overall, I'm starting to wonder whether it's necessary to have this metric for now.

Contributor Author:

the help attribute of the object describes what it records, but I can add a comment for it as well

Contributor:

I don't think we need such a metric. The time it takes for the pod to become ready is not a characteristic of the lws controller.

prometheus.HistogramOpts{
Subsystem: "lws",
Name: "replica_ready_status_duration",
Help: "latency for each replica to be scheduled and become ready",
}, []string{"leadername"},
)
)

func RollingUpdate(hash string, duration time.Duration) {
rollingUpdateDuration.WithLabelValues(hash).Observe(duration.Seconds())
}

func RecreatingGroup(leaderName string) {
recreateGroupTimes.WithLabelValues(leaderName).Inc()
}

func ReplicaReadyStatus(readyPods []corev1.Pod, startTime metav1.Time) {
for _, pod := range readyPods {
readyTime := podutils.PodReadyConditionLastTransitionTime(pod).Time
Collaborator:

when the leader pod is ready, its worker statefulset may not be ready, right? the latency here calculates the leader pod's latency to become ready

Collaborator:

we may need to pass the worker sts and leader pod as a pair and make sure that latency = worker_sts_ready - leader_pod_created, wdyt

Contributor Author:

The LeaderPod is only added into the list that is passed if the worker sts is ready; that is checked here.

I guess I'm a bit confused about what the replica latency measures; I thought it was from when the lws object was created/edited to when the replica is ready. So is it from when the leader statefulset is created to when the worker sts is ready?
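For illustration only (not part of this PR's diff): a minimal Go sketch of the pairing suggested above, assuming the caller can supply the time at which each worker StatefulSet became ready; the function name and parameters are hypothetical.

// Hypothetical alternative suggested in review: record per-replica latency as
// worker-StatefulSet-ready minus leader-pod-created, per leader/worker pair.
func ReplicaReadyLatency(leader corev1.Pod, workerStsReadyTime metav1.Time) {
	latency := workerStsReadyTime.Time.Sub(leader.CreationTimestamp.Time)
	replicaReadyStatusDuration.WithLabelValues(leader.Name).Observe(latency.Seconds())
}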

latency := readyTime.Sub(startTime.Time)
replicaReadyStatusDuration.WithLabelValues(pod.Name).Observe(latency.Seconds())
}
}

func Register() {
metrics.Registry.MustRegister(
rollingUpdateDuration,
recreateGroupTimes,
replicaReadyStatusDuration,
)
}
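Of the three collectors, RollingUpdate is registered above but not invoked anywhere in this diff. A hypothetical call site, sketched only to show the intended shape (updateStartTime is an assumed value the controller would have to track; it is not part of this PR):

// Hypothetical: once every replica carries the new template hash, record how
// long the rolling update took for that hash.
if updatedCount == int(*lws.Spec.Replicas) {
	metrics.RollingUpdate(templateHash, time.Since(updateStartTime.Time))
}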
28 changes: 28 additions & 0 deletions pkg/metrics/metrics_test.go
@@ -0,0 +1,28 @@
package metrics

import (
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestRecreatingGroup(t *testing.T) {
prometheus.MustRegister(recreateGroupTimes)

RecreatingGroup("lws-sample-0")
RecreatingGroup("lws-sample-1")
RecreatingGroup("lws-sample-0")

if count := testutil.CollectAndCount(recreateGroupTimes); count != 2 {
t.Errorf("Expecting %d metrics, got: %d", 2, count)
}

if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-0")); count != float64(2) {
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-0", 2, count)
}

if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-1")); count != float64(1) {
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-1", 1, count)
}
}
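A possible companion test for the histogram path, following the same testutil pattern; this is a sketch only, and it additionally assumes imports for time, corev1 ("k8s.io/api/core/v1"), and metav1 ("k8s.io/apimachinery/pkg/apis/meta/v1"), plus a pod fixture that carries an explicit Ready condition so PodReadyConditionLastTransitionTime can resolve it.

func TestReplicaReadyStatus(t *testing.T) {
	prometheus.MustRegister(replicaReadyStatusDuration)

	// A replica whose progress started 30s ago and whose leader pod just became Ready.
	start := metav1.NewTime(time.Now().Add(-30 * time.Second))
	pod := corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "lws-sample-0"},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			Conditions: []corev1.PodCondition{{
				Type:               corev1.PodReady,
				Status:             corev1.ConditionTrue,
				LastTransitionTime: metav1.Now(),
			}},
		},
	}

	ReplicaReadyStatus([]corev1.Pod{pod}, start)

	// Exactly one labelled series ("lws-sample-0") should have been observed.
	if count := testutil.CollectAndCount(replicaReadyStatusDuration); count != 1 {
		t.Errorf("Expecting %d metrics, got: %d", 1, count)
	}
}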
5 changes: 5 additions & 0 deletions pkg/utils/pod/pod_utils.go
@@ -18,6 +18,7 @@ package pod

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
)
@@ -56,6 +57,10 @@ func PodRunningAndReady(pod corev1.Pod) bool {
return pod.Status.Phase == corev1.PodRunning && podReady(pod)
}

func PodReadyConditionLastTransitionTime(pod corev1.Pod) metav1.Time {
return getPodReadyCondition(pod.Status).LastTransitionTime
}

func podReady(pod corev1.Pod) bool {
return podReadyConditionTrue(pod.Status)
}