Adding system metrics support #97

Closed
wants to merge 3 commits into from
5 changes: 1 addition & 4 deletions Dockerfile
@@ -19,10 +19,7 @@ RUN go mod download
# Copy the go source
COPY cmd/main.go cmd/main.go
COPY api/ api/
COPY pkg/controllers/ pkg/controllers/
COPY pkg/cert/ pkg/cert/
COPY pkg/webhooks/ pkg/webhooks/
COPY pkg/utils pkg/utils
COPY pkg/ pkg/

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
3 changes: 3 additions & 0 deletions cmd/main.go
@@ -35,6 +35,7 @@ import (
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/cert"
"sigs.k8s.io/lws/pkg/controllers"
"sigs.k8s.io/lws/pkg/metrics"
"sigs.k8s.io/lws/pkg/webhooks"
//+kubebuilder:scaffold:imports
)
@@ -76,6 +77,8 @@ func main() {
kubeConfig.QPS = float32(qps)
kubeConfig.Burst = burst

metrics.Register()

mgr, err := ctrl.NewManager(kubeConfig, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: metricsAddr},
2 changes: 1 addition & 1 deletion go.mod
@@ -7,6 +7,7 @@ require (
github.com/onsi/ginkgo/v2 v2.17.1
github.com/onsi/gomega v1.32.0
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.18.0
k8s.io/api v0.29.3
k8s.io/apiextensions-apiserver v0.29.3
k8s.io/apimachinery v0.29.3
@@ -48,7 +49,6 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.18.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
17 changes: 15 additions & 2 deletions pkg/controllers/leaderworkerset_controller.go
@@ -41,6 +41,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/metrics"
"sigs.k8s.io/lws/pkg/utils"
podutils "sigs.k8s.io/lws/pkg/utils/pod"
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
@@ -348,7 +349,7 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l
readyCount := 0
updatedCount := 0
templateHash := utils.LeaderWorkerTemplateHash(lws)

var readyLeaderPods []corev1.Pod
// Iterate through all statefulsets.
for _, sts := range lwssts.Items {
if sts.Name == lws.Name {
@@ -367,7 +368,7 @@ }
}
if podutils.PodRunningAndReady(leaderPod) {
readyCount++

readyLeaderPods = append(readyLeaderPods, leaderPod)
if sts.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash && leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey] == templateHash {
updatedCount++
}
@@ -389,6 +390,9 @@ func (r *LeaderWorkerSetReconciler) updateConditions(ctx context.Context, lws *l
updateCondition := setCondition(lws, condition)
// if condition changed, record events
if updateCondition {
if updatedCount == int(*lws.Spec.Replicas) {
metrics.ReplicaReadyStatus(readyLeaderPods, getLastTransitionTime(string(leaderworkerset.LeaderWorkerSetProgressing), lws))
Collaborator:

the start time here is when the lws has just started progressing; there is a possibility that the leader pod is not scheduled yet.

Contributor Author:

Asked a similar question in another comment: what is the replica latency actually measuring? I thought it was from lws creation to when the replica is ready.

}
r.Record.Eventf(lws, corev1.EventTypeNormal, condition.Reason, condition.Message+fmt.Sprintf(", with %d groups ready of total %d groups", readyCount, int(*lws.Spec.Replicas)))
}
return updateStatus || updateCondition, nil
@@ -569,3 +573,12 @@ func templateUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerS
func replicasUpdated(sts *appsv1.StatefulSet, lws *leaderworkerset.LeaderWorkerSet) bool {
return *sts.Spec.Replicas != *lws.Spec.Replicas
}

func getLastTransitionTime(conditionType string, lws *leaderworkerset.LeaderWorkerSet) metav1.Time {
Collaborator:

maybe move this to a util file?

Contributor Author:

Was thinking about that as well; I can create an lws utils file similar to the one that exists for pods, and move this function, templateUpdated, and replicasUpdated to it in a different PR.

for _, condition := range lws.Status.Conditions {
if condition.Type == conditionType {
return condition.LastTransitionTime
}
}
return metav1.Now()
}
2 changes: 2 additions & 0 deletions pkg/controllers/pod_controller.go
@@ -39,6 +39,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/metrics"
acceleratorutils "sigs.k8s.io/lws/pkg/utils/accelerators"
podutils "sigs.k8s.io/lws/pkg/utils/pod"
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
@@ -174,6 +175,7 @@ func (r *PodReconciler) handleRestartPolicy(ctx context.Context, pod corev1.Pod,
}); err != nil {
return false, err
}
metrics.RecreatingGroup(leader.Name)
return true, nil
}

78 changes: 78 additions & 0 deletions pkg/metrics/metrics.go
@@ -0,0 +1,78 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
podutils "sigs.k8s.io/lws/pkg/utils/pod"

"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
rollingUpdateDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "lws",
Name: "rolling_update_duration",
Help: "Duration of rolling updates",
}, []string{"hash"},
)

recreateGroupTimes = prometheus.NewCounterVec(
Collaborator:

this is not called anywhere else, right?

Contributor Author:

Yeah, adding it to register it, but waiting on #106 to implement it.

prometheus.CounterOpts{
Subsystem: "lws",
Name: "recreate_group_times",
Help: "number of times a group has been recreated",
}, []string{"leadername"},
)

replicaReadyStatusDuration = prometheus.NewHistogramVec(
Collaborator:

add a comment here about what the metric records, like "the latency for a pod group to be ready after creation"

Collaborator:

but overall, I'm starting to wonder whether it's necessary to have this metric for now.

Contributor Author:

the help attribute of the object describes what it records, but I can add a comment for it as well

Contributor:

I don't think we need such a metric. The time it takes for the pod to become ready is not a characteristic of the lws controller.

prometheus.HistogramOpts{
Subsystem: "lws",
Name: "replica_ready_status_duration",
Help: "latency for each replica to be scheduled and become ready",
}, []string{"leadername"},
)
)

func RollingUpdate(hash string, duration time.Duration) {
rollingUpdateDuration.WithLabelValues(hash).Observe(duration.Seconds())
}

func RecreatingGroup(leaderName string) {
recreateGroupTimes.WithLabelValues(leaderName).Inc()
}

func ReplicaReadyStatus(readyPods []corev1.Pod, startTime metav1.Time) {
for _, pod := range readyPods {
readyTime := podutils.PodReadyConditionLastTransitionTime(pod).Time
Collaborator:

when the leader pod is ready, its worker statefulset may not be ready, right? the latency here calculates the leader pod's latency to become ready

Collaborator:

we may need to pass the worker sts and leader pod as a pair and make sure that latency = worker_sts_ready - leader_pod_created, wdyt

Contributor Author:

The LeaderPod is only added into the list that is passed if the worker sts is ready; that is checked here.

I guess I'm a bit confused about what the replica latency measures; I thought it was from when the lws object was created/edited to when the replica is ready. So is it from when the leader statefulset is created to when the worker sts is ready?
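For illustration only (not part of this PR's diff): a minimal Go sketch of the pairing suggested above, assuming the caller can supply the time at which each worker StatefulSet became ready; the function name and parameters are hypothetical.

// Hypothetical alternative suggested in review: record per-replica latency as
// worker-StatefulSet-ready minus leader-pod-created, per leader/worker pair.
func ReplicaReadyLatency(leader corev1.Pod, workerStsReadyTime metav1.Time) {
	latency := workerStsReadyTime.Time.Sub(leader.CreationTimestamp.Time)
	replicaReadyStatusDuration.WithLabelValues(leader.Name).Observe(latency.Seconds())
}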

latency := readyTime.Sub(startTime.Time)
replicaReadyStatusDuration.WithLabelValues(pod.Name).Observe(latency.Seconds())
}
}

func Register() {
metrics.Registry.MustRegister(
rollingUpdateDuration,
recreateGroupTimes,
replicaReadyStatusDuration,
)
}
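Of the three collectors, RollingUpdate is registered above but not invoked anywhere in this diff. A hypothetical call site, sketched only to show the intended shape (updateStartTime is an assumed value the controller would have to track; it is not part of this PR):

// Hypothetical: once every replica carries the new template hash, record how
// long the rolling update took for that hash.
if updatedCount == int(*lws.Spec.Replicas) {
	metrics.RollingUpdate(templateHash, time.Since(updateStartTime.Time))
}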
28 changes: 28 additions & 0 deletions pkg/metrics/metrics_test.go
@@ -0,0 +1,28 @@
package metrics

import (
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestRecreatingGroup(t *testing.T) {
prometheus.MustRegister(recreateGroupTimes)

RecreatingGroup("lws-sample-0")
RecreatingGroup("lws-sample-1")
RecreatingGroup("lws-sample-0")

if count := testutil.CollectAndCount(recreateGroupTimes); count != 2 {
t.Errorf("Expecting %d metrics, got: %d", 2, count)
}

if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-0")); count != float64(2) {
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-0", 2, count)
}

if count := testutil.ToFloat64(recreateGroupTimes.WithLabelValues("lws-sample-1")); count != float64(1) {
t.Errorf("Expecting %s to have value %d, but got %f", "lws-sample-1", 1, count)
}
}
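A possible companion test for the histogram path, following the same testutil pattern; this is a sketch only, and it additionally assumes imports for time, corev1 ("k8s.io/api/core/v1"), and metav1 ("k8s.io/apimachinery/pkg/apis/meta/v1"), plus a pod fixture that carries an explicit Ready condition so PodReadyConditionLastTransitionTime can resolve it.

func TestReplicaReadyStatus(t *testing.T) {
	prometheus.MustRegister(replicaReadyStatusDuration)

	// A replica whose progress started 30s ago and whose leader pod just became Ready.
	start := metav1.NewTime(time.Now().Add(-30 * time.Second))
	pod := corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "lws-sample-0"},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			Conditions: []corev1.PodCondition{{
				Type:               corev1.PodReady,
				Status:             corev1.ConditionTrue,
				LastTransitionTime: metav1.Now(),
			}},
		},
	}

	ReplicaReadyStatus([]corev1.Pod{pod}, start)

	// Exactly one labelled series ("lws-sample-0") should have been observed.
	if count := testutil.CollectAndCount(replicaReadyStatusDuration); count != 1 {
		t.Errorf("Expecting %d metrics, got: %d", 1, count)
	}
}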
5 changes: 5 additions & 0 deletions pkg/utils/pod/pod_utils.go
@@ -18,6 +18,7 @@ package pod

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
)
@@ -56,6 +57,10 @@ func PodRunningAndReady(pod corev1.Pod) bool {
return pod.Status.Phase == corev1.PodRunning && podReady(pod)
}

func PodReadyConditionLastTransitionTime(pod corev1.Pod) metav1.Time {
return getPodReadyCondition(pod.Status).LastTransitionTime
}

func podReady(pod corev1.Pod) bool {
return podReadyConditionTrue(pod.Status)
}