Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
914 changes: 914 additions & 0 deletions .wordlist.txt

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ COPY --from=builder /opt/app-root/src/helm-charts-k8s/crds/deviceconfig-crd.yaml
/opt/app-root/src/helm-charts-k8s/charts/node-feature-discovery/crds/nfd-api-crds.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/module-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/nodemodulesconfig-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/clusterworkflowtemplate-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/cronworkflow-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowartifactgctask-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflow-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workfloweventbinding-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtaskresult-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtaskset-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtemplate-crd.yaml \
/opt/helm-charts-crds-k8s/

RUN mkdir -p /remediation
Expand Down
25 changes: 18 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ YAML_FILES=bundle/manifests/amd-gpu-operator-node-metrics_rbac.authorization.k8s
CRD_YAML_FILES = deviceconfig-crd.yaml remediationworkflowstatus-crd.yaml
K8S_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml
DEFAULT_VALUES_FILES=helm-charts-k8s/values.yaml hack/k8s-patch/metadata-patch/values.yaml
REMEDIATION_CRD_YAML_FILES=clusterworkflowtemplate-crd.yaml cronworkflow-crd.yaml workflowartifactgctask-crd.yaml workflow-crd.yaml workfloweventbinding-crd.yaml workflowtaskresult-crd.yaml workflowtaskset-crd.yaml workflowtemplate-crd.yaml

GPU_OPERATOR_CHART ?= $(shell pwd)/helm-charts-k8s/gpu-operator-helm-k8s-$(PROJECT_VERSION).tgz
KUBECTL_CMD ?= kubectl
Expand All @@ -68,8 +69,12 @@ ifdef SKIP_INSTALL_DEFAULT_CR
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
endif

ifdef SKIP_REMEDIATION_CONTROLLER
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
# Set SKIP_REMEDIATION (to any value) on the make command line to disable the
# auto-remediation feature: expands to a helm flag that sets
# remediation.enabled=false. Consumed by the helm install invocation elsewhere
# in this Makefile — TODO confirm which targets pass $(SKIP_REMEDIATION_CMD).
ifdef SKIP_REMEDIATION
SKIP_REMEDIATION_CMD=--set remediation.enabled=false
endif

# Set SKIP_REMEDIATION_CRDS (to any value) to skip installing the remediation
# (Argo Workflows) CRDs: expands to a helm flag that sets
# remediation.installCRDs=false.
ifdef SKIP_REMEDIATION_CRDS
SKIP_REMEDIATION_CRDS_CMD=--set remediation.installCRDs=false
endif

#################################
Expand Down Expand Up @@ -332,7 +337,7 @@ helm: ## Build helm charts for Kubernetes.
$(MAKE) helm-k8s

.PHONY: helm-k8s
helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts
helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts gen-remediation-charts
$(KUSTOMIZE) build config/default | $(HELMIFY) helm-charts-k8s
# Patching k8s helm chart metadata
cp $(shell pwd)/hack/k8s-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/
Expand All @@ -345,9 +350,7 @@ helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts
# Patching k8s helm chart kmm subchart
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation-crds/
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint .; cd ..;
mkdir $(shell pwd)/helm-charts-k8s/crds
echo "moving crd yaml files to crds folder"
Expand Down Expand Up @@ -585,7 +588,15 @@ endif
rm helm-charts-k8s/charts/kmm/templates/$$file; \
done

cert-manager-install: ## Deploy cert-manager.
# Generate the remediation-crds helm subchart from the kustomize overlay, then
# render each CRD template into the chart's crds/ folder (helm installs files
# under crds/ before any templates) and remove the now-redundant template copy.
.PHONY: gen-remediation-charts
gen-remediation-charts: ## Generate the remediation-crds helm subchart and its CRDs.
	$(KUSTOMIZE) build $(CURDIR)/hack/k8s-patch/k8s-remediation-patch | $(HELMIFY) helm-charts-k8s/charts/remediation-crds
	mkdir -p helm-charts-k8s/charts/remediation-crds/crds
	@for file in $(REMEDIATION_CRD_YAML_FILES); do \
		helm template amd-gpu helm-charts-k8s/charts/remediation-crds -s templates/$$file > helm-charts-k8s/charts/remediation-crds/crds/$$file; \
		rm helm-charts-k8s/charts/remediation-crds/templates/$$file; \
	done

# Install cert-manager (pinned to v1.15.1) into its own namespace; CRDs are
# installed by the chart itself (crds.enabled=true).
cert-manager-install: ## Deploy cert-manager.
	helm repo add jetstack https://charts.jetstack.io --force-update
	helm install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --version v1.15.1 --set crds.enabled=true

Expand Down
13 changes: 11 additions & 2 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ type RemediationWorkflowSpec struct {
Enable *bool `json:"enable,omitempty"`

// Name of the ConfigMap that holds condition-to-workflow mappings.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Config",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:config"}
Config *v1.LocalObjectReference `json:"config,omitempty"`

// Time to live for argo workflow object and its pods for a failed workflow. Accepts duration strings like "30s", "4h", "24h". By default, it is set to 24h
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
Expand Down Expand Up @@ -125,6 +125,15 @@ type RemediationWorkflowSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeDrainPolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy"}
// +optional
NodeDrainPolicy *DrainSpec `json:"nodeDrainPolicy,omitempty"`

// AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
// If true, remediation workflow will be automatically started when the node condition matches.
// If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
// This field gives users more control and flexibility on when to start the remediation workflow.
// Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="AutoStartWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow"}
// +kubebuilder:default:=true
AutoStartWorkflow *bool `json:"autoStartWorkflow,omitempty"`
}

type RegistryTLS struct {
Expand Down
9 changes: 7 additions & 2 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 15 additions & 3 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -707,11 +707,23 @@ spec:
path: remediationWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
- description: AutoStartWorkflow specifies the behavior of the remediation workflow.
Default value is true. If true, remediation workflow will be automatically
started when the node condition matches. If false, remediation workflow
will be in suspended state when the node condition matches and needs to
be manually started by the user. This field gives users more control and
flexibility on when to start the remediation workflow. Default value is
set to true if not specified and the remediation workflow automatically
starts when the node condition matches.
displayName: AutoStartWorkflow
path: remediationWorkflow.autoStartWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
displayName: ConditionalWorkflows
path: remediationWorkflow.conditionalWorkflows
displayName: Config
path: remediationWorkflow.config
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
- urn:alm:descriptor:com.amd.deviceconfigs:config
- description: enable remediation workflows. disabled by default enable if operator
should automatically handle remediation of node in case of gpu issues
displayName: Enable
Expand Down
11 changes: 10 additions & 1 deletion bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1386,7 +1386,16 @@ spec:
remediationWorkflow:
description: remediation workflow
properties:
conditionalWorkflows:
autoStartWorkflow:
default: true
description: |-
AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
If true, remediation workflow will be automatically started when the node condition matches.
If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
This field gives users more control and flexibility on when to start the remediation workflow.
Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
type: boolean
config:
description: Name of the ConfigMap that holds condition-to-workflow
mappings.
properties:
Expand Down
11 changes: 10 additions & 1 deletion config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1382,7 +1382,16 @@ spec:
remediationWorkflow:
description: remediation workflow
properties:
conditionalWorkflows:
autoStartWorkflow:
default: true
description: |-
AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
If true, remediation workflow will be automatically started when the node condition matches.
If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
This field gives users more control and flexibility on when to start the remediation workflow.
Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
type: boolean
config:
description: Name of the ConfigMap that holds condition-to-workflow
mappings.
properties:
Expand Down
18 changes: 15 additions & 3 deletions config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -678,11 +678,23 @@ spec:
path: remediationWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
- description: AutoStartWorkflow specifies the behavior of the remediation workflow.
Default value is true. If true, remediation workflow will be automatically
started when the node condition matches. If false, remediation workflow
will be in suspended state when the node condition matches and needs to
be manually started by the user. This field gives users more control and
flexibility on when to start the remediation workflow. Default value is
set to true if not specified and the remediation workflow automatically
starts when the node condition matches.
displayName: AutoStartWorkflow
path: remediationWorkflow.autoStartWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
displayName: ConditionalWorkflows
path: remediationWorkflow.conditionalWorkflows
displayName: Config
path: remediationWorkflow.config
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
- urn:alm:descriptor:com.amd.deviceconfigs:config
- description: enable remediation workflows. disabled by default enable if operator
should automatically handle remediation of node in case of gpu issues
displayName: Enable
Expand Down
97 changes: 78 additions & 19 deletions docs/autoremediation/auto-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,29 +80,86 @@ The GPU Operator installs Argo Workflows v3.6.5, using a [customized installatio

The DeviceConfig Custom Resource includes a `RemediationWorkflowSpec` section for configuring and customizing the auto-remediation feature:

```golang
type RemediationWorkflowSpec struct {
Enable *bool

ConditionalWorkflows *v1.LocalObjectReference

TtlForFailedWorkflows int

TesterImage string

MaxParallelWorkflows int

NodeRemediationLabels map[string]string

NodeRemediationTaints []v1.Taint

NodeDrainPolicy *DrainSpec
}
```yaml
remediationWorkflow:
# Enable auto node remediation feature for AMD GPU Operator. Disabled by default.
# Set to true to activate automatic remediation workflows when GPU issues are detected.
enable: true

# ConfigMap containing mappings between node conditions and remediation workflows.
# If not specified, the operator uses the default 'default-conditional-workflow-mappings' ConfigMap.
# The ConfigMap defines which workflow template to execute for each specific error condition.
config:
name: configmapName

# Time-to-live duration for retaining failed workflow objects and pods before cleanup.
# Accepts duration strings like "5h", "24h", "30m", "1h30m". Default is 24 hours.
# Retaining failed workflows allows for post-mortem analysis and troubleshooting.
ttlForFailedWorkflows: 5h

# Container image used for executing GPU validation tests during remediation workflows.
# This image runs test suites to verify GPU health after remediation completes.
# Default image supports only RVS tests. Contact AMD for AGFHC-enabled test runner.
testerImage: docker.io/rocm/test-runner:v1.4.1

# Maximum number of remediation workflows that can execute concurrently across the cluster.
# Helps maintain minimum node availability by preventing excessive simultaneous remediations.
# A value of 0 (default) means no limit is enforced. Excess workflows are queued as Pending.
maxParallelWorkflows: 0

# Custom taints to apply to nodes during the remediation process.
# If not specified, the operator applies the default taint 'amd-gpu-unhealthy:NoSchedule'.
# Taints prevent new workload scheduling on affected nodes during remediation.
nodeRemediationTaints:
- key: # Taint key (e.g., 'amd-gpu-unhealthy')
value: # Taint value (e.g., specific error condition)
effect: # Taint effect (e.g., 'NoSchedule', 'NoExecute', 'PreferNoSchedule')

# Custom labels to apply to nodes during automatic remediation workflows.
# These labels persist throughout the remediation process and can be used for
# monitoring, tracking, or applying custom policies.
nodeRemediationLabels:
label-one-key: label-one-val
label-two-key: label-two-val

# Configuration for pod eviction behavior when draining workloads from nodes.
# Controls how pods are removed during remediation, including timeouts, grace periods,
# and namespace exclusions to protect critical infrastructure.
nodeDrainPolicy:
# Enable forced draining of pods that do not respond to standard termination signals.
# When true, pods that cannot be evicted gracefully will be forcibly removed.
force: false

# Maximum time in seconds to wait for the drain operation to complete.
# A value of 0 means infinite timeout. Default is 300 seconds (5 minutes).
timeoutSeconds: 300

# Grace period in seconds for pods to shut down gracefully after termination signal.
# Overrides each pod's terminationGracePeriodSeconds. Use -1 to respect pod settings.
gracePeriodSeconds: 60

# When true, DaemonSet-managed pods are excluded from the drain operation.
# DaemonSets are designed to run on all nodes and will automatically reschedule.
ignoreDaemonSets: true

# List of namespaces to exclude from pod eviction during drain operation.
# Pods in these namespaces remain on the node, allowing critical infrastructure
# components to continue operating throughout the remediation process.
ignoreNamespaces:
- kube-system
- cert-manager

# AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
# If true, remediation workflow will be automatically started when the node condition matches.
# If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
# This field gives users more control and flexibility on when to start the remediation workflow.
# Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
autoStartWorkflow: true
```

**Enable** - Controls whether automatic node remediation is enabled. Set this field to `true` to activate the auto-remediation feature in the cluster.

**ConditionalWorkflows** - References a ConfigMap that contains mappings between node conditions and their corresponding remediation workflows. The GPU Operator automatically creates a `default-conditional-workflow-mappings` ConfigMap with predefined mappings. Users can either modify this default ConfigMap or create their own custom ConfigMap. If left empty, the default ConfigMap will be used automatically. More about the ConfigMap in [below section](auto-remediation.md#remediation-workflow-configmap).
**Config** - References a ConfigMap that contains mappings between node conditions and their corresponding remediation workflows. The GPU Operator automatically creates a `default-conditional-workflow-mappings` ConfigMap with predefined mappings. Users can either modify this default ConfigMap or create their own custom ConfigMap. If left empty, the default ConfigMap will be used automatically. More about the ConfigMap in [below section](auto-remediation.md#remediation-workflow-configmap).

> **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator.

Expand All @@ -122,6 +179,8 @@ When the number of triggered workflows exceeds this limit, additional workflows

**NodeDrainPolicy** - Configures the pod eviction behavior when draining workloads from nodes during the remediation process. This policy controls how pods are removed, including timeout settings, grace periods, and namespace exclusions. See the [Node Drain Policy Configuration](#node-drain-policy-configuration) section below for detailed field descriptions.

**AutoStartWorkflow** - Specifies the behavior of the remediation workflow. Default value is `true`. If `true`, the remediation workflow is automatically started when the node condition matches. If `false`, the remediation workflow remains in a suspended state when the node condition matches and must be manually started by the user. To resume the workflow at a later point, refer to the [resume workflow section](#resuming-a-paused-workflow).

**Spec.CommonConfig.UtilsContainer** - Remediation workflow uses a utility image for executing the steps. Specify the utility image in `Spec.CommonConfig.UtilsContainer` section of Device Config. If the UtilsContainer section is not specified, default image used is `docker.io/rocm/gpu-operator-utils:latest`

#### Node Drain Policy Configuration
Expand Down
Loading