@@ -2090,6 +2090,105 @@ def _queue_tasks(tis):
20902090 states = [x .state for x in dr .get_task_instances (session = session )]
20912091 assert states == ["failed" , "failed" ]
20922092
@conf_vars({("scheduler", "num_stuck_in_queued_retries"): "2"})
def test_handle_stuck_queued_tasks_reschedule_sensors(self, dag_maker, session, mock_executors):
    """Check that a "running" log event resets the stuck-in-queued counter.

    Reschedule sensors go in and out of running repeatedly while keeping the
    same try_number. The number of stuck-in-queued attempts must therefore be
    counted per reschedule (since the last "running" event), not per try_number.

    With ``num_stuck_in_queued_retries`` set to 2, the test expects each task to
    be rescheduled twice, then (after a "running" event is logged) rescheduled
    twice more before finally being failed on the third stuck detection.
    """
    reschedule_event = "stuck in queued reschedule"
    exceeded_event = "stuck in queued tries exceeded"

    with dag_maker("test_fail_stuck_queued_tasks_multiple_executors"):
        EmptyOperator(task_id="op1")
        EmptyOperator(task_id="op2", executor="default_exec")

    run_id = str(uuid4())

    def _queue_tasks(tis):
        """Put every task instance into the queued state, stamped with the current time."""
        for ti in tis:
            ti.state = "queued"
            ti.queued_dttm = timezone.utcnow()
        session.commit()

    def _add_running_event(tis):
        """Add a "running" Log row for each task instance at its current try_number."""
        for ti in tis:
            updated_entry = Log(
                dttm=timezone.utcnow(),
                dag_id=ti.dag_id,
                task_id=ti.task_id,
                map_index=ti.map_index,
                event="running",
                run_id=ti.run_id,
                try_number=ti.try_number,
            )
            session.add(updated_entry)

    def _logged_events():
        """Return the event names logged for this run, ordered by Log.id."""
        stmt = select(Log).where(Log.run_id == run_id).order_by(Log.id)
        return [entry.event for entry in session.scalars(stmt).all()]

    def _run_stuck_handler():
        """Invoke the scheduler's stuck-in-queued handling with the mocked executors loaded."""
        with _loader_mock(mock_executors):
            scheduler._handle_tasks_stuck_in_queued()

    dr = dag_maker.create_dagrun(run_id=run_id)

    tis = dr.get_task_instances(session=session)
    _queue_tasks(tis=tis)
    scheduler_job = Job()
    scheduler = SchedulerJobRunner(job=scheduler_job, num_runs=0)
    scheduler._task_queued_timeout = -300  # always in violation of timeout

    _run_stuck_handler()
    # If the task gets stuck in queued once, we reset it to scheduled
    tis = dr.get_task_instances(session=session)
    assert [x.state for x in tis] == ["scheduled", "scheduled"]
    assert [x.queued_dttm for x in tis] == [None, None]

    _queue_tasks(tis=tis)
    # One reschedule event per task after the first pass.
    assert _logged_events() == [reschedule_event] * 2

    _run_stuck_handler()

    # Second pass: both tasks rescheduled again, still nothing failed.
    assert _logged_events() == [reschedule_event] * 4
    mock_executors[0].fail.assert_not_called()
    tis = dr.get_task_instances(session=session)
    assert [x.state for x in tis] == ["scheduled", "scheduled"]

    _add_running_event(tis)  # This should "reset" the count of stuck queued

    for _ in range(3):  # Should be able to be stuck 3 more times before failing
        _queue_tasks(tis=tis)
        _run_stuck_handler()
        tis = dr.get_task_instances(session=session)

    expected_events = (
        [reschedule_event] * 4  # two reschedules per task before the "running" event
        + ["running"] * 2  # one "running" event per task
        + [reschedule_event] * 4  # two more reschedules per task after the reset
        + [exceeded_event] * 2  # third stuck detection after the reset fails both tasks
    )
    assert _logged_events() == expected_events

    mock_executors[0].fail.assert_not_called()  # just demoing that we don't fail with executor method
    states = [x.state for x in dr.get_task_instances(session=session)]
    assert states == ["failed", "failed"]
2191+
20932192 def test_revoke_task_not_imp_tolerated (self , dag_maker , session , caplog ):
20942193 """Test that if executor no implement revoke_task then we don't blow up."""
20952194 with dag_maker ("test_fail_stuck_queued_tasks" ):
0 commit comments