Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
914 changes: 914 additions & 0 deletions .wordlist.txt

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ COPY --from=builder /opt/app-root/src/helm-charts-k8s/crds/deviceconfig-crd.yaml
/opt/app-root/src/helm-charts-k8s/charts/node-feature-discovery/crds/nfd-api-crds.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/module-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/kmm/crds/nodemodulesconfig-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/clusterworkflowtemplate-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/cronworkflow-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowartifactgctask-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflow-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workfloweventbinding-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtaskresult-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtaskset-crd.yaml \
/opt/app-root/src/helm-charts-k8s/charts/remediation-crds/crds/workflowtemplate-crd.yaml \
/opt/helm-charts-crds-k8s/

RUN mkdir -p /remediation
Expand Down
25 changes: 18 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ YAML_FILES=bundle/manifests/amd-gpu-operator-node-metrics_rbac.authorization.k8s
CRD_YAML_FILES = deviceconfig-crd.yaml remediationworkflowstatus-crd.yaml
K8S_KMM_CRD_YAML_FILES=module-crd.yaml nodemodulesconfig-crd.yaml
DEFAULT_VALUES_FILES=helm-charts-k8s/values.yaml hack/k8s-patch/metadata-patch/values.yaml
REMEDIATION_CRD_YAML_FILES=clusterworkflowtemplate-crd.yaml cronworkflow-crd.yaml workflowartifactgctask-crd.yaml workflow-crd.yaml workfloweventbinding-crd.yaml workflowtaskresult-crd.yaml workflowtaskset-crd.yaml workflowtemplate-crd.yaml

GPU_OPERATOR_CHART ?= $(shell pwd)/helm-charts-k8s/gpu-operator-helm-k8s-$(PROJECT_VERSION).tgz
KUBECTL_CMD ?= kubectl
Expand All @@ -68,8 +69,12 @@ ifdef SKIP_INSTALL_DEFAULT_CR
SKIP_INSTALL_DEFAULT_CR_CMD=--set crds.defaultCR.install=false
endif

ifdef SKIP_REMEDIATION_CONTROLLER
SKIP_REMEDIATION_CONTROLLER_CMD=--set remediation.enabled=false
# Set SKIP_REMEDIATION (to any value) on the make command line to disable the
# auto-remediation feature: expands to a helm flag that sets
# remediation.enabled=false. Consumed by the helm install invocation elsewhere
# in this Makefile — TODO confirm which targets pass $(SKIP_REMEDIATION_CMD).
ifdef SKIP_REMEDIATION
SKIP_REMEDIATION_CMD=--set remediation.enabled=false
endif

# Set SKIP_REMEDIATION_CRDS (to any value) to skip installing the remediation
# (Argo Workflows) CRDs: expands to a helm flag that sets
# remediation.installCRDs=false.
ifdef SKIP_REMEDIATION_CRDS
SKIP_REMEDIATION_CRDS_CMD=--set remediation.installCRDs=false
endif

#################################
Expand Down Expand Up @@ -332,7 +337,7 @@ helm: ## Build helm charts for Kubernetes.
$(MAKE) helm-k8s

.PHONY: helm-k8s
helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts
helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts gen-remediation-charts
$(KUSTOMIZE) build config/default | $(HELMIFY) helm-charts-k8s
# Patching k8s helm chart metadata
cp $(shell pwd)/hack/k8s-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/
Expand All @@ -345,9 +350,7 @@ helm-k8s: helmify manifests kustomize clean-helm gen-kmm-charts
# Patching k8s helm chart kmm subchart
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/
cp $(shell pwd)/hack/k8s-patch/k8s-kmm-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/kmm/templates/
mkdir -p $(shell pwd)/helm-charts-k8s/charts/remediation/templates
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/template-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation/templates/
cp $(shell pwd)/hack/k8s-patch/k8s-remediation-patch/metadata-patch/*.yaml $(shell pwd)/helm-charts-k8s/charts/remediation-crds/
cd $(shell pwd)/helm-charts-k8s; helm dependency update; helm lint .; cd ..;
mkdir $(shell pwd)/helm-charts-k8s/crds
echo "moving crd yaml files to crds folder"
Expand Down Expand Up @@ -585,7 +588,15 @@ endif
rm helm-charts-k8s/charts/kmm/templates/$$file; \
done

cert-manager-install: ## Deploy cert-manager.
# Generate the remediation-crds helm subchart from the kustomize overlay, then
# render each CRD template into the chart's crds/ folder (helm installs files
# under crds/ before any templates) and remove the now-redundant template copy.
.PHONY: gen-remediation-charts
gen-remediation-charts: ## Generate the remediation-crds helm subchart and its CRDs.
	$(KUSTOMIZE) build $(CURDIR)/hack/k8s-patch/k8s-remediation-patch | $(HELMIFY) helm-charts-k8s/charts/remediation-crds
	mkdir -p helm-charts-k8s/charts/remediation-crds/crds
	@for file in $(REMEDIATION_CRD_YAML_FILES); do \
		helm template amd-gpu helm-charts-k8s/charts/remediation-crds -s templates/$$file > helm-charts-k8s/charts/remediation-crds/crds/$$file; \
		rm helm-charts-k8s/charts/remediation-crds/templates/$$file; \
	done

# Install cert-manager (pinned to v1.15.1) into its own namespace; CRDs are
# installed by the chart itself (crds.enabled=true).
cert-manager-install: ## Deploy cert-manager.
	helm repo add jetstack https://charts.jetstack.io --force-update
	helm install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --version v1.15.1 --set crds.enabled=true

Expand Down
13 changes: 11 additions & 2 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ type RemediationWorkflowSpec struct {
Enable *bool `json:"enable,omitempty"`

// Name of the ConfigMap that holds condition-to-workflow mappings.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="ConditionalWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows"}
ConditionalWorkflows *v1.LocalObjectReference `json:"conditionalWorkflows,omitempty"`
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Config",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:config"}
Config *v1.LocalObjectReference `json:"config,omitempty"`

// Time to live for argo workflow object and its pods for a failed workflow. Accepts duration strings like "30s", "4h", "24h". By default, it is set to 24h
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="TtlForFailedWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:ttlForFailedWorkflows"}
Expand Down Expand Up @@ -125,6 +125,15 @@ type RemediationWorkflowSpec struct {
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="NodeDrainPolicy",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:nodeDrainPolicy"}
// +optional
NodeDrainPolicy *DrainSpec `json:"nodeDrainPolicy,omitempty"`

// AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
// If true, remediation workflow will be automatically started when the node condition matches.
// If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
// This field gives users more control and flexibility on when to start the remediation workflow.
// Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="AutoStartWorkflow",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow"}
// +kubebuilder:default:=true
AutoStartWorkflow *bool `json:"autoStartWorkflow,omitempty"`
}

type RegistryTLS struct {
Expand Down
9 changes: 7 additions & 2 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 15 additions & 3 deletions bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -707,11 +707,23 @@ spec:
path: remediationWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
- description: AutoStartWorkflow specifies the behavior of the remediation workflow.
Default value is true. If true, remediation workflow will be automatically
started when the node condition matches. If false, remediation workflow
will be in suspended state when the node condition matches and needs to
be manually started by the user. This field gives users more control and
flexibility on when to start the remediation workflow. Default value is
set to true if not specified and the remediation workflow automatically
starts when the node condition matches.
displayName: AutoStartWorkflow
path: remediationWorkflow.autoStartWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
displayName: ConditionalWorkflows
path: remediationWorkflow.conditionalWorkflows
displayName: Config
path: remediationWorkflow.config
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
- urn:alm:descriptor:com.amd.deviceconfigs:config
- description: enable remediation workflows. disabled by default enable if operator
should automatically handle remediation of node in case of gpu issues
displayName: Enable
Expand Down
11 changes: 10 additions & 1 deletion bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1386,7 +1386,16 @@ spec:
remediationWorkflow:
description: remediation workflow
properties:
conditionalWorkflows:
autoStartWorkflow:
default: true
description: |-
AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
If true, remediation workflow will be automatically started when the node condition matches.
If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
This field gives users more control and flexibility on when to start the remediation workflow.
Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
type: boolean
config:
description: Name of the ConfigMap that holds condition-to-workflow
mappings.
properties:
Expand Down
11 changes: 10 additions & 1 deletion config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1382,7 +1382,16 @@ spec:
remediationWorkflow:
description: remediation workflow
properties:
conditionalWorkflows:
autoStartWorkflow:
default: true
description: |-
AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
If true, remediation workflow will be automatically started when the node condition matches.
If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
This field gives users more control and flexibility on when to start the remediation workflow.
Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
type: boolean
config:
description: Name of the ConfigMap that holds condition-to-workflow
mappings.
properties:
Expand Down
18 changes: 15 additions & 3 deletions config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -678,11 +678,23 @@ spec:
path: remediationWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:remediationWorkflow
- description: AutoStartWorkflow specifies the behavior of the remediation workflow.
Default value is true. If true, remediation workflow will be automatically
started when the node condition matches. If false, remediation workflow
will be in suspended state when the node condition matches and needs to
be manually started by the user. This field gives users more control and
flexibility on when to start the remediation workflow. Default value is
set to true if not specified and the remediation workflow automatically
starts when the node condition matches.
displayName: AutoStartWorkflow
path: remediationWorkflow.autoStartWorkflow
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:autoStartWorkflow
- description: Name of the ConfigMap that holds condition-to-workflow mappings.
displayName: ConditionalWorkflows
path: remediationWorkflow.conditionalWorkflows
displayName: Config
path: remediationWorkflow.config
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:conditionalWorkflows
- urn:alm:descriptor:com.amd.deviceconfigs:config
- description: enable remediation workflows. disabled by default enable if operator
should automatically handle remediation of node in case of gpu issues
displayName: Enable
Expand Down
97 changes: 78 additions & 19 deletions docs/autoremediation/auto-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,29 +80,86 @@ The GPU Operator installs Argo Workflows v3.6.5, using a [customized installatio

The DeviceConfig Custom Resource includes a `RemediationWorkflowSpec` section for configuring and customizing the auto-remediation feature:

```golang
type RemediationWorkflowSpec struct {
Enable *bool

ConditionalWorkflows *v1.LocalObjectReference

TtlForFailedWorkflows int

TesterImage string

MaxParallelWorkflows int

NodeRemediationLabels map[string]string

NodeRemediationTaints []v1.Taint

NodeDrainPolicy *DrainSpec
}
```yaml
remediationWorkflow:
# Enable auto node remediation feature for AMD GPU Operator. Disabled by default.
# Set to true to activate automatic remediation workflows when GPU issues are detected.
enable: true

# ConfigMap containing mappings between node conditions and remediation workflows.
# If not specified, the operator uses the default 'default-conditional-workflow-mappings' ConfigMap.
# The ConfigMap defines which workflow template to execute for each specific error condition.
config:
name: configmapName

# Time-to-live duration for retaining failed workflow objects and pods before cleanup.
# Accepts duration strings like "5h", "24h", "30m", "1h30m". Default is 24 hours.
# Retaining failed workflows allows for post-mortem analysis and troubleshooting.
ttlForFailedWorkflows: 5h

# Container image used for executing GPU validation tests during remediation workflows.
# This image runs test suites to verify GPU health after remediation completes.
# Default image supports only RVS tests. Contact AMD for AGFHC-enabled test runner.
testerImage: docker.io/rocm/test-runner:v1.4.1

# Maximum number of remediation workflows that can execute concurrently across the cluster.
# Helps maintain minimum node availability by preventing excessive simultaneous remediations.
# A value of 0 (default) means no limit is enforced. Excess workflows are queued as Pending.
maxParallelWorkflows: 0

# Custom taints to apply to nodes during the remediation process.
# If not specified, the operator applies the default taint 'amd-gpu-unhealthy:NoSchedule'.
# Taints prevent new workload scheduling on affected nodes during remediation.
nodeRemediationTaints:
- key: # Taint key (e.g., 'amd-gpu-unhealthy')
value: # Taint value (e.g., specific error condition)
effect: # Taint effect (e.g., 'NoSchedule', 'NoExecute', 'PreferNoSchedule')

# Custom labels to apply to nodes during automatic remediation workflows.
# These labels persist throughout the remediation process and can be used for
# monitoring, tracking, or applying custom policies.
nodeRemediationLabels:
label-one-key: label-one-val
label-two-key: label-two-val

# Configuration for pod eviction behavior when draining workloads from nodes.
# Controls how pods are removed during remediation, including timeouts, grace periods,
# and namespace exclusions to protect critical infrastructure.
nodeDrainPolicy:
# Enable forced draining of pods that do not respond to standard termination signals.
# When true, pods that cannot be evicted gracefully will be forcibly removed.
force: false

# Maximum time in seconds to wait for the drain operation to complete.
# A value of 0 means infinite timeout. Default is 300 seconds (5 minutes).
timeoutSeconds: 300

# Grace period in seconds for pods to shut down gracefully after termination signal.
# Overrides each pod's terminationGracePeriodSeconds. Use -1 to respect pod settings.
gracePeriodSeconds: 60

# When true, DaemonSet-managed pods are excluded from the drain operation.
# DaemonSets are designed to run on all nodes and will automatically reschedule.
ignoreDaemonSets: true

# List of namespaces to exclude from pod eviction during drain operation.
# Pods in these namespaces remain on the node, allowing critical infrastructure
# components to continue operating throughout the remediation process.
ignoreNamespaces:
- kube-system
- cert-manager

# AutoStartWorkflow specifies the behavior of the remediation workflow. Default value is true.
# If true, remediation workflow will be automatically started when the node condition matches.
# If false, remediation workflow will be in suspended state when the node condition matches and needs to be manually started by the user.
# This field gives users more control and flexibility on when to start the remediation workflow.
# Default value is set to true if not specified and the remediation workflow automatically starts when the node condition matches.
autoStartWorkflow: true
```

**Enable** - Controls whether automatic node remediation is enabled. Set this field to `true` to activate the auto-remediation feature in the cluster.

**ConditionalWorkflows** - References a ConfigMap that contains mappings between node conditions and their corresponding remediation workflows. The GPU Operator automatically creates a `default-conditional-workflow-mappings` ConfigMap with predefined mappings. Users can either modify this default ConfigMap or create their own custom ConfigMap. If left empty, the default ConfigMap will be used automatically. More about the ConfigMap in [below section](auto-remediation.md#remediation-workflow-configmap).
**Config** - References a ConfigMap that contains mappings between node conditions and their corresponding remediation workflows. The GPU Operator automatically creates a `default-conditional-workflow-mappings` ConfigMap with predefined mappings. Users can either modify this default ConfigMap or create their own custom ConfigMap. If left empty, the default ConfigMap will be used automatically. More about the ConfigMap in [below section](auto-remediation.md#remediation-workflow-configmap).

> **Note:** The `default-conditional-workflow-mappings` ConfigMap is created automatically by the GPU Operator.

Expand All @@ -122,6 +179,8 @@ When the number of triggered workflows exceeds this limit, additional workflows

**NodeDrainPolicy** - Configures the pod eviction behavior when draining workloads from nodes during the remediation process. This policy controls how pods are removed, including timeout settings, grace periods, and namespace exclusions. See the [Node Drain Policy Configuration](#node-drain-policy-configuration) section below for detailed field descriptions.

**AutoStartWorkflow** - Specifies the behavior of the remediation workflow. Default value is `true`. If `true`, the remediation workflow is automatically started when the node condition matches. If `false`, the remediation workflow remains in a suspended state when the node condition matches and must be manually started by the user. To resume the workflow at a later point, refer to the [resume workflow section](#resuming-a-paused-workflow).

**Spec.CommonConfig.UtilsContainer** - Remediation workflow uses a utility image for executing the steps. Specify the utility image in `Spec.CommonConfig.UtilsContainer` section of Device Config. If the UtilsContainer section is not specified, default image used is `docker.io/rocm/gpu-operator-utils:latest`

#### Node Drain Policy Configuration
Expand Down
Loading