Skip to content

Commit 8a85027

Browse files
committed
Merge remote-tracking branch 'origin/cohere' into sync/upstream-2026-04-16-a2c24858
2 parents a2c2485 + 6ab675b commit 8a85027

42 files changed

Lines changed: 570 additions & 20 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/peerpods-chart_image.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,25 @@ jobs:
155155
--password-stdin
156156
echo "Helm authenticated with ghcr.io"
157157
158+
- name: Authenticate to GCP
159+
if: ${{ contains(steps.registry.outputs.registry, 'docker.pkg.dev') }}
160+
uses: google-github-actions/auth@c200f3691d83b41bf9bbd8638997a462592937ed # v2.1.13
161+
with:
162+
workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
163+
service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}
164+
165+
- name: Authenticate Helm with Artifact Registry
166+
if: ${{ contains(steps.registry.outputs.registry, 'docker.pkg.dev') }}
167+
env:
168+
REGISTRY: ${{ steps.registry.outputs.registry }}
169+
run: |
170+
AR_HOST=$(echo "${REGISTRY}" | cut -d'/' -f1)
171+
echo "Authenticating Helm with ${AR_HOST}..."
172+
gcloud auth print-access-token | helm registry login "${AR_HOST}" \
173+
--username oauth2accesstoken \
174+
--password-stdin
175+
echo "Helm authenticated with ${AR_HOST}"
176+
158177
- name: Update Helm dependencies
159178
run: |
160179
echo "Updating Helm dependencies..."

src/cloud-api-adaptor/install/charts/peerpods/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: v2
22
name: peerpods
33
description: Cloud API Adaptor (Peerpods) Helm Chart
44
type: application
5-
version: 0.1.4
5+
version: 0.1.4-cohere.1
66
appVersion: "v0.19.0"
77

88
keywords:

src/cloud-api-adaptor/install/charts/peerpods/providers/gcp.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ providerConfigs:
6666
# (default: "")
6767
# GCP_SUBNETWORK: ""
6868

69+
# Use Spot (preemptible) instances for peer pod VMs. Reduces cost significantly but VMs may be reclaimed by GCP at any time.
70+
# (default: "false")
71+
# GCP_USE_SPOT_INSTANCES: "false"
72+
6973
# Zone
7074
# (required)
7175
GCP_ZONE: ""

src/cloud-api-adaptor/install/charts/peerpods/templates/daemonset.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ spec:
2525
serviceAccountName: cloud-api-adaptor
2626
containers:
2727
- command:
28+
{{- if .Values.command }}
29+
{{- toYaml .Values.command | nindent 8 }}
30+
{{- else }}
2831
- /usr/local/bin/entrypoint.sh
32+
{{- end }}
2933
env:
3034
- name: NODE_NAME
3135
valueFrom:
@@ -100,7 +104,7 @@ spec:
100104
hostNetwork: true
101105
dnsPolicy: ClusterFirstWithHostNet
102106
nodeSelector:
103-
node.kubernetes.io/worker: ""
107+
{{- toYaml .Values.nodeSelector | nindent 8 }}
104108
volumes:
105109
{{- if and (or (eq .Values.provider "libvirt") (eq .Values.provider "byom")) (include "peerpods.sshKeySecretName" .) }}
106110
- name: ssh
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
{{- if .Values.gkeNodeFix.enabled }}
2+
apiVersion: apps/v1
3+
kind: DaemonSet
4+
metadata:
5+
name: fix-gke-node-config
6+
namespace: {{ .Release.Namespace }}
7+
labels:
8+
app: fix-gke-node-config
9+
spec:
10+
selector:
11+
matchLabels:
12+
app: fix-gke-node-config
13+
template:
14+
metadata:
15+
labels:
16+
app: fix-gke-node-config
17+
spec:
18+
tolerations:
19+
- operator: Exists
20+
hostPID: true
21+
nodeSelector:
22+
{{- toYaml .Values.nodeSelector | nindent 8 }}
23+
initContainers:
24+
- name: fix-containerd
25+
image: alpine:3.21
26+
securityContext:
27+
privileged: true
28+
volumeMounts:
29+
- name: host-containerd-config
30+
mountPath: /etc/containerd
31+
command:
32+
- /bin/sh
33+
- -c
34+
- |
35+
TOML=/etc/containerd/config.toml
36+
37+
if grep -q 'discard_unpacked_layers = false' "$TOML"; then
38+
echo "containerd: already patched, skipping"
39+
exit 0
40+
fi
41+
42+
if grep -q 'discard_unpacked_layers = true' "$TOML"; then
43+
sed -i 's/discard_unpacked_layers = true/discard_unpacked_layers = false/g' "$TOML"
44+
echo "containerd: patched $TOML"
45+
grep discard_unpacked_layers "$TOML"
46+
nsenter -t 1 -m -u -i -n -p -- systemctl restart containerd
47+
echo "containerd: restarted"
48+
else
49+
echo "containerd: discard_unpacked_layers setting not found, skipping restart"
50+
grep discard_unpacked_layers "$TOML" || true
51+
fi
52+
- name: fix-kubelet
53+
image: alpine:3.21
54+
securityContext:
55+
privileged: true
56+
volumeMounts:
57+
- name: host-kubelet-config
58+
mountPath: /home/kubernetes
59+
command:
60+
- /bin/sh
61+
- -c
62+
- |
63+
CONFIG=/home/kubernetes/kubelet-config.yaml
64+
DESIRED="{{ .Values.gkeNodeFix.runtimeRequestTimeout }}"
65+
66+
if grep -q "runtimeRequestTimeout: \"$DESIRED\"" "$CONFIG" 2>/dev/null; then
67+
echo "kubelet: already set to $DESIRED, skipping"
68+
exit 0
69+
fi
70+
71+
if grep -q "runtimeRequestTimeout:" "$CONFIG"; then
72+
sed -i "s/runtimeRequestTimeout:.*/runtimeRequestTimeout: \"$DESIRED\"/" "$CONFIG"
73+
else
74+
echo "runtimeRequestTimeout: \"$DESIRED\"" >> "$CONFIG"
75+
fi
76+
77+
echo "kubelet: patched $CONFIG"
78+
grep runtimeRequestTimeout "$CONFIG"
79+
80+
nsenter -t 1 -m -u -i -n -p -- systemctl restart kubelet
81+
echo "kubelet: restarted"
82+
containers:
83+
- name: pause
84+
image: registry.k8s.io/pause:3.10
85+
volumes:
86+
- name: host-containerd-config
87+
hostPath:
88+
path: /etc/containerd
89+
type: Directory
90+
- name: host-kubelet-config
91+
hostPath:
92+
path: /home/kubernetes
93+
type: Directory
94+
{{- end }}

src/cloud-api-adaptor/install/charts/peerpods/templates/rbac.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ kind: ServiceAccount
55
metadata:
66
name: cloud-api-adaptor
77
namespace: {{ .Release.Namespace }}
8+
{{- with .Values.serviceAccount.annotations }}
9+
annotations:
10+
{{- toYaml . | nindent 4 }}
11+
{{- end }}
812
---
913
apiVersion: rbac.authorization.k8s.io/v1
1014
kind: ClusterRole

src/cloud-api-adaptor/install/charts/peerpods/values.yaml

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,29 @@ secrets:
7979
#
8080
# Note: libvirt/docker provider files override this with the dev image.
8181
image:
82-
name: quay.io/confidential-containers/cloud-api-adaptor
83-
tag: "latest"
82+
name: us-central1-docker.pkg.dev/cohere-confidential-computing/cloud-api-adaptor/cloud-api-adaptor
83+
tag: "44d97551e74d3d835e6ab17d37475f1033b2011a"
8484

8585
# Cloud provider: libvirt, aws, azure, gcp, ibmcloud, vsphere
86-
provider: libvirt
86+
provider: gcp
87+
88+
# Override the DaemonSet container command. Set to bypass entrypoint.sh when
89+
# using Workload Identity (no GCP_CREDENTIALS file needed). If unset or empty, the DaemonSet falls back to /usr/local/bin/entrypoint.sh.
90+
command: ["/bin/sh", "-c", "exec cloud-api-adaptor gcp"]
91+
92+
# nodeSelector applied to both the main peerpods DaemonSet (templates/daemonset.yaml) and the fix-gke-node-config DaemonSet. GKE nodes are labeled by the node pool config.
93+
nodeSelector:
94+
fortress.cohere.com/caa-worker: "true"
95+
96+
# ServiceAccount annotations (e.g. Workload Identity binding)
97+
serviceAccount:
98+
annotations: {}
99+
100+
# GKE node config fixes. Deploys a separate DaemonSet that patches containerd
101+
# (discard_unpacked_layers) and kubelet (runtimeRequestTimeout) on each node matched by nodeSelector, restarting the affected service whenever it changes the config.
102+
gkeNodeFix:
103+
enabled: true
104+
runtimeRequestTimeout: "15m"
87105

88106
# Maximum number of peer pods allowed to run simultaneously
89107
# This limit prevents resource exhaustion on the cloud provider
@@ -117,6 +135,9 @@ daemonset:
117135
# Configuration options documented in ../../../peerpod-ctrl/chart/values.yaml
118136
resourceCtrl:
119137
enabled: true
138+
image:
139+
repository: us-central1-docker.pkg.dev/cohere-confidential-computing/cloud-api-adaptor/peerpod-ctrl
140+
tag: "bfd5b9847b2fb44c843f6a3b6209092d0a217d83"
120141

121142
# peerpods-webhook subchart configuration
122143
# Mutating webhook that modifies pod specs to use peer pods runtime and resources

src/cloud-api-adaptor/pkg/util/agentproto/redirector.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"net"
1111
"sync"
12+
"time"
1213

1314
"github.com/containerd/ttrpc"
1415
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
@@ -17,6 +18,12 @@ import (
1718
pb "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
1819
)
1920

21+
// statsTimeout bounds how long StatsContainer, GetMetrics, and GetVolumeStats RPCs may block.
22+
// Keeping this well under the metrics-server scrape interval (~10s) prevents a
23+
// single slow or unreachable peer-pod VM from stalling kubelet stats collection
24+
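// for the entire node. The deadline is created after s.Connect(ctx) returns, so it bounds only the agent RPC itself, not connection setup.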
25+
const statsTimeout = 5 * time.Second
26+
2027
type Redirector interface {
2128
pb.AgentServiceService
2229
pb.HealthService
@@ -154,6 +161,8 @@ func (s *redirector) StatsContainer(ctx context.Context, req *pb.StatsContainerR
154161
if err := s.Connect(ctx); err != nil {
155162
return nil, err
156163
}
164+
ctx, cancel := context.WithTimeout(ctx, statsTimeout)
165+
defer cancel()
157166
return s.agentClient.StatsContainer(ctx, req)
158167
}
159168

@@ -281,6 +290,8 @@ func (s *redirector) GetMetrics(ctx context.Context, req *pb.GetMetricsRequest)
281290
if err := s.Connect(ctx); err != nil {
282291
return nil, err
283292
}
293+
ctx, cancel := context.WithTimeout(ctx, statsTimeout)
294+
defer cancel()
284295
return s.agentClient.GetMetrics(ctx, req)
285296
}
286297

@@ -393,6 +404,8 @@ func (s *redirector) GetVolumeStats(ctx context.Context, req *pb.VolumeStatsRequ
393404
if err := s.Connect(ctx); err != nil {
394405
return nil, err
395406
}
407+
ctx, cancel := context.WithTimeout(ctx, statsTimeout)
408+
defer cancel()
396409
return s.agentClient.GetVolumeStats(ctx, req)
397410
}
398411

src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ COPY mkosi.presets /image/mkosi.presets
3333
COPY mkosi.profiles /image/mkosi.profiles
3434
COPY mkosi.skeleton /image/mkosi.skeleton
3535
COPY mkosi.skeleton-debug /image/mkosi.skeleton-debug
36+
COPY mkosi.skeleton-debug-fedora /image/mkosi.skeleton-debug-fedora
3637
COPY mkosi.skeleton-sftp /image/mkosi.skeleton-sftp
3738
COPY mkosi.workspace /image/mkosi.workspace
3839
COPY resources /image/resources

src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
# syntax=docker/dockerfile:1.5.0-labs
2-
# ubuntu:24.04
3-
FROM ubuntu@sha256:0d39fcc8335d6d74d5502f6df2d30119ff4790ebbb60b364818d5112d9e3e932 AS builder
2+
# ubuntu:24.10 (Oracular) — provides systemd 256 EFI stub with TDX RTMR support
3+
FROM ubuntu@sha256:cdf755952ed117f6126ff4e65810bf93767d4c38f5c7185b50ec1f1078b464cc AS builder
44

55
ARG MKOSI_VERSION="v22"
66
ARG PROFILE="debug"
77
ARG IMAGE_VERSION="0.0.0"
88

99
ARG DEBIAN_FRONTEND=noninteractive
1010

11+
# 24.10 is EOL; redirect apt to old-releases mirror
12+
RUN sed -i 's|archive.ubuntu.com|old-releases.ubuntu.com|g; s|security.ubuntu.com|old-releases.ubuntu.com|g' \
13+
/etc/apt/sources.list.d/ubuntu.sources
14+
1115
RUN apt-get update && \
1216
apt-get install -y \
1317
bubblewrap \
18+
curl \
1419
git \
20+
gnupg \
1521
cpio \
1622
systemd-repart \
1723
kmod \
@@ -37,10 +43,39 @@ COPY mkosi.presets /image/mkosi.presets
3743
COPY mkosi.profiles /image/mkosi.profiles
3844
COPY mkosi.skeleton /image/mkosi.skeleton
3945
COPY mkosi.skeleton-debug /image/mkosi.skeleton-debug
46+
COPY mkosi.skeleton-debug-ubuntu /image/mkosi.skeleton-debug-ubuntu
4047
COPY mkosi.skeleton-sftp /image/mkosi.skeleton-sftp
4148
COPY mkosi.workspace /image/mkosi.workspace
4249
COPY resources /image/resources
4350
COPY mkosi.conf.ubuntu /image/mkosi.conf
51+
52+
# Add NVIDIA APT repos to mkosi.skeleton/ so they are present in the image tree
53+
# *before* package installation. mkosi applies SkeletonTrees before apt-get runs.
54+
RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
55+
/image/mkosi.skeleton/etc/apt/preferences.d \
56+
/image/mkosi.skeleton/usr/share/keyrings \
57+
&& curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-archive-keyring.gpg \
58+
-o /image/mkosi.skeleton/usr/share/keyrings/cuda-archive-keyring.gpg \
59+
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
60+
| gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
61+
&& echo 'deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /' \
62+
> /image/mkosi.skeleton/etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list \
63+
&& echo 'deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /' \
64+
> /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-container-toolkit.list \
65+
&& printf '%s\n' \
66+
'Package: nvidia-*' \
67+
'Pin: origin developer.download.nvidia.com' \
68+
'Pin-Priority: 1001' \
69+
'' \
70+
'Package: libnvidia-*' \
71+
'Pin: origin developer.download.nvidia.com' \
72+
'Pin-Priority: 1001' \
73+
'' \
74+
'Package: cuda-*' \
75+
'Pin: origin developer.download.nvidia.com' \
76+
'Pin-Priority: 1001' \
77+
> /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo
78+
4479
RUN --security=insecure mkosi --profile=$PROFILE --image-version=$IMAGE_VERSION
4580

4681
FROM scratch

0 commit comments

Comments
 (0)