9898DR = DagRun
9999DM = DagModel
100100
101- RESCHEDULE_STUCK_IN_QUEUED_EVENT = "rescheduling stuck in queued"
101+ STUCK_IN_QUEUED_EVENT = "stuck in queued"
102+ """:meta private:"""
102103
103104
104105class ConcurrencyMap :
@@ -1790,7 +1791,7 @@ def _handle_tasks_stuck_in_queued(self, session: Session = NEW_SESSION) -> None:
17901791
17911792 As a compromise between always failing a stuck task and always rescheduling a stuck task (which could
17921793 lead to tasks being stuck in queued forever without informing the user), we have created the config
1793- `[core] num_stuck_reschedules `. With this new configuration, an airflow admin can decide how
1794+ ``[scheduler] num_stuck_in_queued_retries``. With this new configuration, an Airflow admin can decide how
17941795 sensitive they would like their airflow to be WRT failing stuck tasks.
17951796 """
17961797 self .log .debug ("Calling SchedulerJob._fail_tasks_stuck_in_queued method" )
@@ -1803,65 +1804,73 @@ def _handle_tasks_stuck_in_queued(self, session: Session = NEW_SESSION) -> None:
18031804 )
18041805 ).all ()
18051806
1806- num_allowed_retries = conf .getint ("core " , "num_stuck_reschedules " )
1807+ num_allowed_retries = conf .getint ("scheduler " , "num_stuck_in_queued_retries " )
18071808 for executor , stuck_tis in self ._executor_to_tis (tasks_stuck_in_queued ).items ():
1808- try :
1809- cleaned_up_task_instances = set (executor .cleanup_stuck_queued_tasks (tis = stuck_tis ))
1810- for ti in stuck_tis :
1811- if repr (ti ) in cleaned_up_task_instances :
1812- num_times_stuck = self ._get_num_times_stuck_in_queued (ti , session )
1813- if num_times_stuck < num_allowed_retries :
1814- self .log .warning (
1815- "Task %s was stuck in queued and will be requeued, once it has hit %s attempts"
1816- " the task will be marked as failed. After that, if the task instance has "
1817- "available retries, it will be retried." , ti .key , num_allowed_retries
1818- )
1819- session .add (
1820- Log (
1821- event = RESCHEDULE_STUCK_IN_QUEUED_EVENT ,
1822- task_instance = ti .key ,
1823- extra = (
1824- f"Task was stuck in queued and will be requeued, once it has hit { num_allowed_retries } attempts"
1825- "Task will be marked as failed. After that, if the task instance has "
1826- "available retries, it will be retried."
1827- ),
1828- )
1829- )
1809+ if not hasattr (executor , "cleanup_stuck_queued_tasks" ):
1810+ continue
18301811
1831- executor .change_state (ti .key , State .SCHEDULED )
1832- session .execute (
1833- update (TI )
1834- .where (TI .filter_for_tis ([ti ]))
1835- .values (
1836- # TODO[ha]: should we use func.now()? How does that work with DB timezone
1837- # on mysql when it's not UTC?
1838- state = TaskInstanceState .SCHEDULED ,
1839- queued_dttm = None ,
1840- # queued_by_job_id=None,
1841- )
1842- .execution_options (synchronize_session = False )
1843- )
1844- else :
1845- self .log .warning (
1846- "Marking task instance %s stuck in queued as failed. "
1847- "If the task instance has available retries, it will be retried." ,
1848- ti ,
1849- )
1850- session .add (
1851- Log (
1852- event = "failing stuck in queued" ,
1853- task_instance = ti .key ,
1854- extra = (
1855- "Task will be marked as failed. If the task instance has "
1856- "available retries, it will be retried."
1857- ),
1858- )
1859- )
1860- executor .fail (ti .key )
1812+ for ti in executor .cleanup_stuck_queued_tasks (tis = stuck_tis ):
1813+ if not isinstance (ti , TaskInstance ):
1814+ # This is for backward compatibility: the pre-2.10.4 version of the interface
1815+ # expected a string return value.
1816+ self .log .warning (
1817+ "Marking task instance %s stuck in queued as failed. "
1818+ "If the task instance has available retries, it will be retried." ,
1819+ ti ,
1820+ )
1821+ continue
18611822
1823+ session .add (
1824+ Log (
1825+ event = STUCK_IN_QUEUED_EVENT ,
1826+ task_instance = ti .key ,
1827+ extra = (
1828+ "Task was in queued state for longer "
1829+ f"than { self ._task_queued_timeout } seconds."
1830+ ),
1831+ )
1832+ )
1833+ self .log .warning ("Task stuck in queued and may be requeued task_id=%s" , ti .key )
1834+
1835+ num_times_stuck = self ._get_num_times_stuck_in_queued (ti , session )
1836+ if num_times_stuck < num_allowed_retries :
1837+ session .add (
1838+ Log (
1839+ event = STUCK_IN_QUEUED_EVENT ,
1840+ task_instance = ti .key ,
1841+ extra = (
1842+ f"Task was stuck in queued and will be requeued, once it has hit { num_allowed_retries } attempts"
1843+ "Task will be marked as failed. After that, if the task instance has "
1844+ "available retries, it will be retried."
1845+ ),
1846+ )
1847+ )
18621848
1863- except NotImplementedError :
1864- self .log .debug ("Executor doesn't support cleanup of stuck queued tasks. Skipping." )
1849+ executor .change_state (ti .key , State .SCHEDULED )
1850+ session .execute (
1851+ update (TI )
1852+ .where (TI .filter_for_tis ([ti ]))
1853+ .values (
1854+ state = TaskInstanceState .SCHEDULED ,
1855+ queued_dttm = None ,
1856+ )
1857+ .execution_options (synchronize_session = False )
1858+ )
1859+ else :
1860+ self .log .warning (
1861+ "Task requeue attempts exceeded max; marking failed. task_instance=%s" , ti
1862+ )
1863+ session .add (
1864+ Log (
1865+ event = "stuck in queued tries exceeded" ,
1866+ task_instance = ti .key ,
1867+ extra = (
1868+ f"Task was requeued more than { num_allowed_retries } times "
1869+ "and will be failed."
1870+ ),
1871+ )
1872+ )
1873+ executor .fail (ti .key )
18651874
18661875 @provide_session
18671876 def _get_num_times_stuck_in_queued (self , ti : TaskInstance , session : Session = NEW_SESSION ) -> int :
@@ -1871,14 +1880,16 @@ def _get_num_times_stuck_in_queued(self, ti: TaskInstance, session: Session = NE
18711880 We can then use this information to determine whether to reschedule a task or fail it.
18721881 """
18731882 return (
1874- session .query (Log ).where (
1883+ session .query (Log )
1884+ .where (
18751885 Log .task_id == ti .task_id ,
18761886 Log .dag_id == ti .dag_id ,
18771887 Log .run_id == ti .run_id ,
18781888 Log .map_index == ti .map_index ,
18791889 Log .try_number == ti .try_number ,
1880- Log .event == RESCHEDULE_STUCK_IN_QUEUED_EVENT ,
1881- ).count ()
1890+ Log .event == STUCK_IN_QUEUED_EVENT ,
1891+ )
1892+ .count ()
18821893 )
18831894
18841895 @provide_session
0 commit comments