
Commit 61c38ae

Aggregate polling for retrieving job pending reason
1 parent 1eee4f9 commit 61c38ae

6 files changed

Lines changed: 192 additions & 95 deletions


docs/config_reference.rst

Lines changed: 44 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -438,12 +438,14 @@ System Partition Configuration
438438

439439
Ignore the ``ReqNodeNotAvail`` Slurm state.
440440

441-
If a job associated to a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job.
442-
Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job.
441+
If a job associated with a test is in a pending state with the Slurm reason ``ReqNodeNotAvail``, ReFrame will cancel the job.
442+
Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to cancel its jobs.
443443
In such cases, you may set this parameter to :obj:`True` to avoid this.
444444

445445
This option is relevant for the Slurm backends only.
446446

447+
.. seealso:: :attr:`~config.systems.partitions.sched_options.slurm_job_cancel_reasons`
448+
447449
.. py:attribute:: systems.partitions.sched_options.job_submit_timeout
448450
449451
:required: No
@@ -490,6 +492,45 @@ System Partition Configuration
490492
.. versionadded:: 4.8
491493

492494

495+
.. py:attribute:: systems.partitions.sched_options.slurm_job_cancel_reasons
496+
497+
:required: No
498+
:default: ``["ReqNodeNotAvail"]``
499+
500+
Reasons to proactively cancel a pending Slurm job.
501+
502+
If a job associated with a test is in a pending state with one of the reasons listed here, ReFrame will cancel the job.
503+
504+
This option is relevant for the Slurm backends only.
505+
506+
.. versionadded:: 4.10
507+
508+
.. seealso::
509+
510+
:attr:`~config.systems.partitions.sched_options.ignore_reqnodenotavail`
511+
:attr:`~config.systems.partitions.sched_options.slurm_pending_job_reason_poll_freq`
512+
513+
514+
.. py:attribute:: systems.partitions.sched_options.slurm_pending_job_reason_poll_freq
515+
516+
:required: No
517+
:default: ``10``
518+
519+
Frequency of polling for the reason a Slurm job is pending.
520+
521+
When using the ``slurm`` backend, ReFrame needs to explicitly issue an ``squeue`` command to get the reason a job is pending, in order to cancel it if it is blocked indefinitely.
522+
This option controls the frequency of this polling.
523+
It is an integer denoting how many regular job polling cycles must elapse before the ``squeue`` command is issued again.
524+
ReFrame will issue a single ``squeue`` command to query all pending jobs at once.
525+
However, if your system is sensitive to Slurm RPC calls, you may consider increasing this value.
526+
527+
This option is relevant for the ``slurm`` backend only.
528+
529+
.. versionadded:: 4.10
530+
531+
.. seealso:: :attr:`~config.systems.partitions.sched_options.slurm_job_cancel_reasons`
532+
533+
493534
.. py:attribute:: systems.partitions.sched_options.unqualified_hostnames
494535
495536
:required: No
@@ -1647,7 +1688,7 @@ The additional properties for the ``httpjson`` handler are the following:
16471688
These may depend on the server configuration.
16481689

16491690
.. note::
1650-
If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire.
1691+
If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire.
16511692
Consider using the :attr:`~config.logging.handlers_perflog..httpjson..authorization_header` parameter instead for dynamic authorization headers.
16521693

16531694
.. versionadded:: 4.2
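
Putting the two new scheduler options together, a minimal sketch of a partition entry in a ReFrame settings file might look as follows; the system, partition and launcher names are illustrative and everything unrelated to the scheduler is omitted::

    site_configuration = {
        'systems': [
            {
                'name': 'example-cluster',      # made-up system name
                'hostnames': ['login'],
                'partitions': [
                    {
                        'name': 'compute',
                        'scheduler': 'slurm',
                        'launcher': 'srun',
                        'sched_options': {
                            # Cancel jobs pending for any of these reasons
                            'slurm_job_cancel_reasons': [
                                'ReqNodeNotAvail',
                                'PartitionDown'
                            ],
                            # Query the pending reason every 20 polling
                            # cycles instead of the default 10
                            'slurm_pending_job_reason_poll_freq': 20
                        }
                    }
                ]
            }
        ]
    }

With this setting, the aggregated ``squeue`` query is issued once every 20 regular job polling cycles.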

docs/manpage.rst

Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2050,7 +2050,7 @@ Whenever an environment variable is associated with a configuration option, its
20502050

20512051
.. envvar:: RFM_IGNORE_REQNODENOTAVAIL
20522052

2053-
Do not treat specially jobs in pending state with the reason ``ReqNodeNotAvail`` (Slurm only).
2053+
Do not cancel jobs that are pending due to the reason ``ReqNodeNotAvail`` (Slurm only).
20542054

20552055
.. table::
20562056
:align: left
@@ -2402,6 +2402,36 @@ Whenever an environment variable is associated with a configuration option, its
24022402
.. versionadded:: 4.7
24032403

24042404

2405+
.. envvar:: RFM_SLURM_JOB_CANCEL_REASONS
2406+
2407+
Reasons to proactively cancel a pending Slurm job.
2408+
2409+
.. table::
2410+
:align: left
2411+
2412+
================================== ==================
2413+
Associated command line option N/A
2414+
Associated configuration parameter :attr:`~config.systems.partitions.sched_options.slurm_job_cancel_reasons`
2415+
================================== ==================
2416+
2417+
.. versionadded:: 4.10
2418+
2419+
2420+
.. envvar:: RFM_SLURM_PENDING_JOB_REASON_POLL_FREQ
2421+
2422+
Frequency of polling for the reason a Slurm job is pending.
2423+
2424+
.. table::
2425+
:align: left
2426+
2427+
================================== ==================
2428+
Associated command line option N/A
2429+
Associated configuration parameter :attr:`~config.systems.partitions.sched_options.slurm_pending_job_reason_poll_freq`
2430+
================================== ==================
2431+
2432+
.. versionadded:: 4.10
2433+
2434+
24052435
.. envvar:: RFM_STAGE_DIR
24062436

24072437
Directory prefix for staging test resources.

reframe/core/schedulers/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,18 @@ def cancel(self, job):
175175
:meta private:
176176
'''
177177

178+
def cancel_many(self, jobs):
179+
'''Cancel multiple jobs at once.
180+
181+
By default, this method calls :meth:`cancel` for each job. Backends
182+
may override this method to implement more efficient cancellation.
183+
184+
:arg jobs: The job descriptors to cancel.
185+
:meta private:
186+
'''
187+
for job in jobs:
188+
self.cancel(job)
189+
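
To show what this hook is meant for, here is a minimal, hypothetical backend sketch that batches all cancellations into a single command instead of issuing one per job; the ``mycancel`` utility and the backend itself are made up, and the real Slurm override appears in the next file::

    import reframe.core.schedulers as sched
    import reframe.utility.osext as osext


    class MyBatchScheduler(sched.JobScheduler):
        # submit(), poll(), finished() etc. omitted for brevity

        def cancel_many(self, jobs):
            # Hypothetical CLI that accepts several job IDs at once,
            # similar to `scancel <id> <id> ...`
            if not jobs:
                return

            osext.run_command(
                'mycancel ' + ' '.join(job.jobid for job in jobs),
                check=True
            )
            for job in jobs:
                job._is_cancelling = True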
178190
@abc.abstractmethod
179191
def finished(self, job):
180192
'''Poll a job.

reframe/core/schedulers/slurm.py

Lines changed: 82 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020
from reframe.core.backends import register_scheduler
2121
from reframe.core.exceptions import (SpawnedProcessError,
2222
JobBlockedError,
23-
JobError,
2423
JobSchedulerError)
2524
from reframe.utility import nodelist_abbrev, seconds_to_hms
2625

@@ -101,13 +100,6 @@ def is_cancelling(self):
101100

102101
@register_scheduler('slurm')
103102
class SlurmJobScheduler(sched.JobScheduler):
104-
# In some systems, scheduler performance is sensitive to the squeue poll
105-
# ratio. In this backend, squeue is used to obtain the reason a job is
106-
# blocked, so as to cancel it if it is blocked indefinitely. The following
107-
# variable controls the frequency of squeue polling compared to the
108-
# standard job state polling using sacct.
109-
SACCT_SQUEUE_RATIO = 10
110-
111103
# This matches the format for both normal and heterogeneous jobs,
112104
# as well as job arrays.
113105
# For heterogeneous jobs, the job_id has the following format:
@@ -125,24 +117,19 @@ def __init__(self):
125117
# Reasons to cancel a pending job: if the job is expected to remain
126118
# pending for a much longer time than usual (mostly if a sysadmin
127119
# intervention is required)
128-
self._cancel_reasons = ['FrontEndDown',
129-
'Licenses', # May require sysadmin
130-
'NodeDown',
131-
'PartitionDown',
132-
'PartitionInactive',
133-
'PartitionNodeLimit',
134-
'QOSJobLimit',
135-
'QOSResourceLimit',
136-
'QOSUsageThreshold']
137-
ignore_reqnodenotavail = self.get_option('ignore_reqnodenotavail')
138-
if not ignore_reqnodenotavail:
139-
self._cancel_reasons.append('ReqNodeNotAvail')
120+
self._cancel_reasons = self.get_option('slurm_job_cancel_reasons')
121+
if self.get_option('ignore_reqnodenotavail'):
122+
with suppress(ValueError):
123+
self._cancel_reasons.remove('ReqNodeNotAvail')
140124

141125
self._update_state_count = 0
142126
self._submit_timeout = self.get_option('job_submit_timeout')
143127
self._use_nodes_opt = self.get_option('use_nodes_option')
144128
self._resubmit_on_errors = self.get_option('resubmit_on_errors')
145129
self._max_sacct_failures = self.get_option('max_sacct_failures')
130+
self._pending_job_reason_poll_freq = self.get_option(
131+
'slurm_pending_job_reason_poll_freq'
132+
)
146133
self._num_sacct_failures = 0
147134
self._sched_access_in_submit = self.get_option(
148135
'sched_access_in_submit'
@@ -529,6 +516,7 @@ def poll(self, *jobs):
529516
jobid = re.split(r'_|\+', s.group('jobid'))[0]
530517
job_info.setdefault(jobid, []).append(s)
531518

519+
# Update job information
532520
for job in jobs:
533521
try:
534522
jobarr_info = job_info[job.jobid]
@@ -538,10 +526,6 @@ def poll(self, *jobs):
538526
# Join the states with ',' in case of job arrays|heterogeneous jobs
539527
job._state = ','.join(m.group('state') for m in jobarr_info)
540528

541-
if not self._update_state_count % self.SACCT_SQUEUE_RATIO:
542-
self._cancel_if_blocked(job)
543-
544-
self._cancel_if_pending_too_long(job)
545529
if slurm_state_completed(job.state):
546530
# Since Slurm exitcodes are positive take the maximum one
547531
job._exitcode = max(
@@ -554,74 +538,71 @@ def poll(self, *jobs):
554538
job, (m.group('end') for m in jobarr_info)
555539
)
556540

557-
def _cancel_if_pending_too_long(self, job):
558-
if not job.max_pending_time or not slurm_state_pending(job.state):
559-
return
541+
# Cancel jobs that are blocked or have been pending for too long
542+
if not self._update_state_count % self._pending_job_reason_poll_freq:
543+
self._cancel_if_blocked(jobs)
544+
545+
self._cancel_if_pending_too_long(jobs)
546+
547+
def _cancel_if_pending_too_long(self, jobs):
548+
cancel_joblist = []
549+
for job in jobs:
550+
if not job.max_pending_time or not slurm_state_pending(job.state):
551+
continue
560552

561-
t_pending = time.time() - job.submit_time
562-
if t_pending >= job.max_pending_time:
563-
self.log('maximum pending time for job exceeded; cancelling it')
564-
self.cancel(job)
565-
job._exception = JobError('maximum pending time exceeded',
566-
job.jobid)
553+
t_pending = time.time() - job.submit_time
554+
if t_pending >= job.max_pending_time:
555+
self.log('maximum pending time for job exceeded; cancelling it')
556+
cancel_joblist.append(job)
567557

568-
def _cancel_if_blocked(self, job, reasons=None):
569-
if (job.is_cancelling or not slurm_state_pending(job.state)):
558+
if cancel_joblist:
559+
self.cancel_many(cancel_joblist)
560+
561+
def _cancel_if_blocked(self, jobs, reasons=None):
562+
pending_jobs = {
563+
job.jobid: job for job in jobs
564+
if not job.is_cancelling and slurm_state_pending(job.state)
565+
}
566+
if not pending_jobs:
570567
return
571568

572-
if not reasons:
573-
completed = osext.run_command('squeue -h -j %s -o %%r' % job.jobid)
574-
reasons = completed.stdout.splitlines()
575-
if not reasons:
576-
# Can't retrieve job's state. Perhaps it has finished already
577-
# and does not show up in the output of squeue
578-
return
569+
pending_reasons = {}
if reasons:
570+
for jobid, reason in reasons.items():
571+
try:
572+
pending_reasons[pending_jobs[jobid]] = reason
573+
except KeyError:
574+
# Job listed in reasons is not pending
575+
continue
576+
else:
577+
job_ids = ",".join(pending_jobs.keys())
578+
completed = osext.run_command(f'squeue -h -j {job_ids} -o "%A|%r"')
579+
pending_reasons = {}
580+
for line in completed.stdout.splitlines():
581+
jobid, reason = line.split('|', maxsplit=1)
582+
pending_reasons[pending_jobs[jobid]] = reason
583+
584+
cancel_joblist = {}
585+
for job, reason_descr in pending_reasons.items():
586+
# The reason description may have two parts as follows:
587+
# "ReqNodeNotAvail, UnavailableNodes:nid00[408,411-415]"
588+
reason = reason_descr.split(',', maxsplit=1)[0].strip()
589+
if reason not in self._cancel_reasons:
590+
continue
579591

580-
# For slurm job arrays the squeue output consists of multiple lines
581-
for r in reasons:
582-
self._do_cancel_if_blocked(job, r)
592+
cancel_joblist[job] = reason_descr
583593

584-
def _do_cancel_if_blocked(self, job, reason_descr):
585-
'''Check if blocking reason ``reason_descr`` is unrecoverable and
586-
cancel the job in this case.'''
594+
if not cancel_joblist:
595+
return
587596

588-
# The reason description may have two parts as follows:
589-
# "ReqNodeNotAvail, UnavailableNodes:nid00[408,411-415]"
590-
try:
591-
reason, reason_details = reason_descr.split(',', maxsplit=1)
592-
except ValueError:
593-
# no reason details
594-
reason, reason_details = reason_descr, None
595-
596-
if reason in self._cancel_reasons:
597-
if reason == 'ReqNodeNotAvail' and reason_details:
598-
self.log('Job blocked due to ReqNodeNotAvail')
599-
node_match = re.match(
600-
r'UnavailableNodes:(?P<node_names>\S+)?',
601-
reason_details.strip()
602-
)
603-
if node_match:
604-
node_names = node_match['node_names']
605-
if node_names:
606-
# Retrieve the info of the unavailable nodes and check
607-
# if they are indeed down. According to Slurm's docs
608-
# this should not be necessary, but we check anyways
609-
# to be on the safe side.
610-
self.log(f'Checking if nodes {node_names!r} '
611-
f'are indeed unavailable')
612-
nodes = self._get_nodes_by_name(node_names)
613-
if not any(self.is_node_down(n) for n in nodes):
614-
return
615-
616-
self.cancel(job)
617-
reason_msg = (
618-
'job cancelled because it was blocked due to '
619-
'a perhaps non-recoverable reason: ' + reason
620-
)
621-
if reason_details is not None:
622-
reason_msg += ', ' + reason_details
623-
624-
job._exception = JobBlockedError(reason_msg, job.jobid)
597+
# Cancel all jobs at once
598+
self.cancel_many(cancel_joblist.keys())
599+
600+
# Set the job exception
601+
for job, reason_descr in cancel_joblist.items():
602+
job._exception = JobBlockedError(
603+
f'job cancelled because it was blocked '
604+
f'due to reason: {reason_descr}', job.jobid
605+
)
625606

626607
def wait(self, job):
627608
# Quickly return in case we have finished already
@@ -640,8 +621,17 @@ def wait(self, job):
640621
self._merge_files(job)
641622

642623
def cancel(self, job):
643-
_run_strict(f'scancel {job.jobid}', timeout=self._submit_timeout)
644-
job._is_cancelling = True
624+
self.cancel_many([job])
625+
626+
def cancel_many(self, jobs):
627+
if not jobs:
628+
self.log('no jobs to cancel')
629+
return
630+
631+
_run_strict(f'scancel {" ".join(job.jobid for job in jobs)}',
632+
timeout=self._submit_timeout)
633+
for job in jobs:
634+
job._is_cancelling = True
645635

646636
def finished(self, job):
647637
return slurm_state_completed(job.state)
@@ -707,10 +697,11 @@ def poll(self, *jobs):
707697

708698
# Use ',' to join nodes to be consistent with Slurm syntax
709699
job._nodespec = ','.join(m.group('nodespec') for m in job_match)
710-
self._cancel_if_blocked(
711-
job, [s.group('reason') for s in state_match]
712-
)
713-
self._cancel_if_pending_too_long(job)
700+
701+
self._cancel_if_blocked(
702+
jobs, {jobid: s.group('reason') for jobid, s in jobinfo.items()}
703+
)
704+
self._cancel_if_pending_too_long(jobs)
714705

715706

716707
def _create_nodes(descriptions):
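
For reference, the aggregation introduced above boils down to the following standalone sketch: a single ``squeue`` call covers all pending job IDs and its ``jobid|reason`` output is turned into a per-job reason map. The job IDs and reasons in the usage comment are made-up sample values::

    import subprocess


    def pending_reasons(jobids):
        '''Return a {jobid: reason} map for the given pending Slurm jobs.

        A single squeue invocation covers all jobs; '%A' prints the job ID
        and '%r' the pending reason.
        '''
        completed = subprocess.run(
            ['squeue', '-h', '-j', ','.join(jobids), '-o', '%A|%r'],
            capture_output=True, text=True, check=True
        )
        reasons = {}
        for line in completed.stdout.splitlines():
            jobid, reason_descr = line.split('|', maxsplit=1)
            # The reason may carry details, e.g.
            # 'ReqNodeNotAvail, UnavailableNodes:nid00[408,411-415]';
            # keep only the leading keyword for matching against the
            # cancellation list.
            reasons[jobid] = reason_descr.split(',', maxsplit=1)[0].strip()

        return reasons


    # Hypothetical usage:
    #   pending_reasons(['1001', '1002'])
    #   -> {'1001': 'ReqNodeNotAvail', '1002': 'Priority'}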
