-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-55075][K8S] Track executor pod creation errors with ExecutorFailureTracker #53840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,6 +118,9 @@ class ExecutorPodsAllocator( | |
|
|
||
| protected val numOutstandingPods = new AtomicInteger() | ||
|
|
||
| // Track total failed pod creation attempts across the application lifecycle | ||
| protected val totalFailedPodCreations = new AtomicInteger(0) | ||
|
|
||
| protected var lastSnapshot = ExecutorPodsSnapshot() | ||
|
|
||
| protected var appId: String = _ | ||
|
|
@@ -459,32 +462,55 @@ class ExecutorPodsAllocator( | |
| .build() | ||
| val resources = replacePVCsIfNeeded( | ||
| podWithAttachedContainer, resolvedExecutorSpec.executorKubernetesResources, reusablePVCs) | ||
| val createdExecutorPod = | ||
| kubernetesClient.pods().inNamespace(namespace).resource(podWithAttachedContainer).create() | ||
| try { | ||
| addOwnerReference(createdExecutorPod, resources) | ||
| resources | ||
| .filter(_.getKind == "PersistentVolumeClaim") | ||
| .foreach { resource => | ||
| if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) { | ||
| addOwnerReference(driverPod.get, Seq(resource)) | ||
| } | ||
| val pvc = resource.asInstanceOf[PersistentVolumeClaim] | ||
| logInfo(log"Trying to create PersistentVolumeClaim " + | ||
| log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} with " + | ||
| log"StorageClass ${MDC(LogKeys.CLASS_NAME, pvc.getSpec.getStorageClassName)}") | ||
| kubernetesClient.persistentVolumeClaims().inNamespace(namespace).resource(pvc).create() | ||
| PVC_COUNTER.incrementAndGet() | ||
| } | ||
| newlyCreatedExecutors(newExecutorId) = (resourceProfileId, clock.getTimeMillis()) | ||
| logDebug(s"Requested executor with id $newExecutorId from Kubernetes.") | ||
| val optCreatedExecutorPod = try { | ||
| Some(kubernetesClient | ||
| .pods() | ||
| .inNamespace(namespace) | ||
| .resource(podWithAttachedContainer) | ||
| .create()) | ||
| } catch { | ||
| case NonFatal(e) => | ||
| kubernetesClient.pods() | ||
| .inNamespace(namespace) | ||
| .resource(createdExecutorPod) | ||
| .delete() | ||
| throw e | ||
| // Register failure with global tracker if lifecycle manager is available | ||
| val failureCount = registerPodCreationFailure() | ||
| logError(log"Failed to create executor pod ${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " + | ||
| log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e) | ||
| None | ||
| } | ||
| optCreatedExecutorPod.foreach { createdExecutorPod => | ||
| try { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why an error during pod creation handled differently from an error coming during adding the owner reference and creating PVC? In both case the nonfatal error ends in deleting the pod so why the 2nd case does not tracked as a failure?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Errors from owner reference and PVC creation are handled differently it appears (since they throw an exception). I've added them to the tracking by the lifecycle manager but also retained the current behaviour of throwing an exception for these cases.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually I have missed that line. Could you please run a manual test and check what happens with the exception which thrown here (by throwing an exception here directly and running one of the integration test)?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the exception being thrown is fine. It. is propagated up through |
||
| addOwnerReference(createdExecutorPod, resources) | ||
| resources | ||
| .filter(_.getKind == "PersistentVolumeClaim") | ||
| .foreach { resource => | ||
| if (conf.get(KUBERNETES_DRIVER_OWN_PVC) && driverPod.nonEmpty) { | ||
| addOwnerReference(driverPod.get, Seq(resource)) | ||
| } | ||
| val pvc = resource.asInstanceOf[PersistentVolumeClaim] | ||
| logInfo(log"Trying to create PersistentVolumeClaim " + | ||
| log"${MDC(LogKeys.PVC_METADATA_NAME, pvc.getMetadata.getName)} with " + | ||
| log"StorageClass ${MDC(LogKeys.CLASS_NAME, pvc.getSpec.getStorageClassName)}") | ||
| kubernetesClient | ||
| .persistentVolumeClaims() | ||
| .inNamespace(namespace) | ||
| .resource(pvc) | ||
| .create() | ||
| PVC_COUNTER.incrementAndGet() | ||
| } | ||
| newlyCreatedExecutors(newExecutorId) = (resourceProfileId, clock.getTimeMillis()) | ||
| logDebug(s"Requested executor with id $newExecutorId from Kubernetes.") | ||
| } catch { | ||
| case NonFatal(e) => | ||
| // Register failure with global tracker if lifecycle manager is available | ||
| val failureCount = registerPodCreationFailure() | ||
| logError(log"Failed to add owner reference or create PVC for executor pod " + | ||
| log"${MDC(LogKeys.EXECUTOR_ID, newExecutorId)}. " + | ||
| log"Total failures: ${MDC(LogKeys.TOTAL, failureCount)}", e) | ||
| kubernetesClient.pods() | ||
| .inNamespace(namespace) | ||
| .resource(createdExecutorPod) | ||
| .delete() | ||
| throw e | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -521,6 +547,16 @@ class ExecutorPodsAllocator( | |
| resources.filterNot(replacedResources.contains) | ||
| } | ||
|
|
||
| /** | ||
| * Registers a pod creation failure with the lifecycle manager and increments the local counter. | ||
| * Returns the total failure count for logging purposes. | ||
| */ | ||
| protected def registerPodCreationFailure(): Int = { | ||
| val failureCount = totalFailedPodCreations.incrementAndGet() | ||
| executorPodsLifecycleManager.foreach(_.registerExecutorFailure()) | ||
| failureCount | ||
| } | ||
|
|
||
| protected def isExecutorIdleTimedOut(state: ExecutorPodState, currentTime: Long): Boolean = { | ||
| try { | ||
| val creationTime = Instant.parse(state.pod.getMetadata.getCreationTimestamp).toEpochMilli() | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Line 473-479 seems to repeated twice here and 506-513. Could you make a method to remove the code duplication?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done