[CI] Upstream terraform infrastructure #324

Merged: 1 commit, Nov 29, 2024
1 change: 1 addition & 0 deletions premerge/.gitignore
.terraform*
31 changes: 31 additions & 0 deletions premerge/README.md
# Premerge Infrastructure

This folder contains the Terraform configuration files that define the GCP
resources used to run the premerge checks. Currently, only Google employees
with access to the GCP project where these checks are hosted are able to apply
changes. Pull requests from anyone are still welcome.

## Setup

- Install Terraform: https://developer.hashicorp.com/terraform/install
- Get GCP application-default credentials: `gcloud auth application-default login`
- Initialize Terraform: `terraform init`

To apply any changes to the cluster:
- Run `terraform apply`.
- Terraform will list the proposed changes.
- Enter `yes` when prompted to apply them.
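If you only want to preview the proposed changes without being prompted to
apply them, `terraform plan` does the same comparison read-only:

```
terraform plan    # preview proposed changes, never modifies anything
terraform apply   # review the plan, then type 'yes' to confirm
```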

## Setting the cluster up for the first time

```
terraform apply -target google_container_node_pool.llvm_premerge_linux_service
terraform apply -target google_container_node_pool.llvm_premerge_linux
terraform apply -target google_container_node_pool.llvm_premerge_windows
terraform apply
```

Setting the cluster up for the first time is more involved because there are
certain resources whose dependencies Terraform is unable to handle explicitly.
This means we have to set up the GKE cluster before we set up any of the
Kubernetes resources; otherwise the Terraform Kubernetes provider will error
out.
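The ordering problem typically comes from provider configuration: the
Kubernetes provider is configured from the GKE cluster's outputs, so the
cluster must exist before any `kubernetes_*` resource can even be planned.
A sketch of what such a provider block looks like (the resource name
`llvm_premerge` here is illustrative, not necessarily the name used in this
repository):

```
# Hypothetical sketch: the provider reads its connection details from the
# GKE cluster resource, which is why the cluster is applied first with
# -target before a full `terraform apply`.
provider "kubernetes" {
  host  = "https://${google_container_cluster.llvm_premerge.endpoint}"
  token = data.google_client_config.current.access_token
  cluster_ca_certificate = base64decode(
    google_container_cluster.llvm_premerge.master_auth[0].cluster_ca_certificate
  )
}
```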
5 changes: 5 additions & 0 deletions premerge/backend.tf
terraform {
  backend "gcs" {
    bucket = "3772b2f502380a18-terraform-remote-backend"
  }
}
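With this backend block in place, `terraform init` needs credentials that can
read the GCS state bucket; the application-default login from the setup
section provides them:

```
gcloud auth application-default login
terraform init    # downloads providers and connects to the GCS state bucket
```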
41 changes: 41 additions & 0 deletions premerge/grafana_values.yaml
metrics:
  enabled: true
  alloy:
    metricsTuning:
      useIntegrationAllowList: true
  cost:
    enabled: true
  kepler:
    enabled: true
  node-exporter:
    enabled: true
logs:
  enabled: true
  pod_logs:
    enabled: true
  cluster_events:
    enabled: true
traces:
  enabled: true
receivers:
  grpc:
    enabled: true
  http:
    enabled: true
  zipkin:
    enabled: true
  grafanaCloudMetrics:
    enabled: false
opencost:
  enabled: true
kube-state-metrics:
  enabled: true
prometheus-node-exporter:
  enabled: true
prometheus-operator-crds:
  enabled: true
kepler:
  enabled: true
alloy: {}
alloy-events: {}
alloy-logs: {}
26 changes: 26 additions & 0 deletions premerge/linux_container_pod_template.yaml
spec:
  tolerations:
    - key: "premerge-platform"
      operator: "Equal"
      value: "linux"
      effect: "NoSchedule"
  nodeSelector:
    premerge-platform: linux
  containers:
    - name: $job
      resources:
        # The job container is always scheduled on the same node as the
        # runner. Since we use the runner's requests.cpu for scheduling and
        # autoscaling, the request here should be set to something small.
        #
        # The limit, however, should be the number of cores of the node:
        # any limit lower than the number of cores could slow down the job.
        #
        # For memory, the request and limit should both be set to realistic
        # values. Memory is not used for scheduling here, but Kubernetes
        # uses it to decide on OOM kills.
        requests:
          cpu: "100m"
          memory: "50Gi"
        limits:
          cpu: 56
          memory: "100Gi"
74 changes: 74 additions & 0 deletions premerge/linux_runners_values.yaml
githubConfigUrl: "https://github.com/llvm"
githubConfigSecret: "github-token"

minRunners: 0
maxRunners: 4

containerMode:
  type: "kubernetes"
  kubernetesModeWorkVolumeClaim:
    accessModes: ["ReadWriteOnce"]
    storageClassName: "standard-rwo"
    resources:
      requests:
        storage: "100Gi"
  kubernetesModeServiceAccount:
    annotations:

template:
  spec:
    tolerations:
      - key: "premerge-platform"
        operator: "Equal"
        value: "linux"
        effect: "NoSchedule"
    nodeSelector:
      premerge-platform: linux
    containers:
      - name: runner
        image: ghcr.io/actions/actions-runner:latest
        command: ["/home/runner/run.sh"]
        resources:
          # The job container will be scheduled on the same node as this
          # runner. If we don't set the CPU request high enough here, two
          # runners can be scheduled on the same node, meaning two jobs.
          #
          # This number should be:
          #  - greater than number_of_cores / 2: a lower value would allow
          #    the scheduler to put two runners on the same node, so two
          #    jobs would share the resources.
          #  - lower than number_of_cores: each node runs some basic
          #    services (metrics, for example) that already require some
          #    CPU (~0.5), so we don't have exactly N cores to allocate,
          #    but N - epsilon.
          #
          # Memory, however, is handled at the job-container level. The
          # runner itself doesn't need much: just enough not to get OOM
          # killed.
          requests:
            cpu: 50
            memory: "2Gi"
          limits:
            cpu: 56
            memory: "2Gi"
        env:
          - name: ACTIONS_RUNNER_CONTAINER_HOOKS
            value: /home/runner/k8s/index.js
          - name: ACTIONS_RUNNER_POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: ACTIONS_RUNNER_REQUIRE_JOB_CONTAINER
            value: "true"
          - name: ACTIONS_RUNNER_CONTAINER_HOOK_TEMPLATE
            value: "/home/runner/pod-config/linux-container-pod-template.yaml"
        volumeMounts:
          - name: container-pod-config
            mountPath: /home/runner/pod-config
    securityContext:
      fsGroup: 123
    volumes:
      - name: container-pod-config
        configMap:
          name: linux-container-pod-template
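Values files like this one are consumed by the actions-runner-controller
`gha-runner-scale-set` Helm chart. An installation might look like the
following sketch (the release name and namespace are illustrative, not
necessarily what the project uses):

```
helm install llvm-premerge-linux-runners \
  oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set \
  --namespace arc-runners --create-namespace \
  -f linux_runners_values.yaml
```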