Skip to content

Commit 23ed0e3

Browse files
feat(configurable shutdown duration): make shutdown duration configurable (#17479)
We want to make the graceful shutdown period configurable instead of hardcoding it to sixty seconds.

Issues:
- #9042 Remove 60s hard cutoff period for graceful shutdowns
- #12831 Want to adjust graceful shutdown time

This is my first PR in Vector, and I am not sure if this is the correct approach:
- Are the ergonomics (-1 for no timeout, 0+ for timeout durations) good?
- Any test recommendations beyond manual testing?

---------

Co-authored-by: Bruce Guenter <bruce.guenter@datadoghq.com>
1 parent 7a4f1f7 commit 23ed0e3

12 files changed

Lines changed: 130 additions & 53 deletions

File tree

lib/vector-common/src/shutdown.rs

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ impl SourceShutdownCoordinator {
200200
///
201201
/// Panics if this coordinator has had its triggers removed (ie
202202
/// has been taken over with `Self::takeover_source`).
203-
pub fn shutdown_all(self, deadline: Instant) -> impl Future<Output = ()> {
203+
pub fn shutdown_all(self, deadline: Option<Instant>) -> impl Future<Output = ()> {
204204
let mut complete_futures = Vec::new();
205205

206206
let shutdown_begun_triggers = self.shutdown_begun_triggers;
@@ -275,7 +275,7 @@ impl SourceShutdownCoordinator {
275275
shutdown_complete_tripwire,
276276
shutdown_force_trigger,
277277
id.clone(),
278-
deadline,
278+
Some(deadline),
279279
)
280280
}
281281

@@ -297,23 +297,27 @@ impl SourceShutdownCoordinator {
297297
shutdown_complete_tripwire: Tripwire,
298298
shutdown_force_trigger: Trigger,
299299
id: ComponentKey,
300-
deadline: Instant,
300+
deadline: Option<Instant>,
301301
) -> impl Future<Output = bool> {
302302
async move {
303-
// Call `shutdown_force_trigger.disable()` on drop.
304-
let shutdown_force_trigger = DisabledTrigger::new(shutdown_force_trigger);
305-
306303
let fut = shutdown_complete_tripwire.then(tripwire_handler);
307-
if timeout_at(deadline, fut).await.is_ok() {
308-
shutdown_force_trigger.into_inner().disable();
309-
true
304+
if let Some(deadline) = deadline {
305+
// Call `shutdown_force_trigger.disable()` on drop.
306+
let shutdown_force_trigger = DisabledTrigger::new(shutdown_force_trigger);
307+
if timeout_at(deadline, fut).await.is_ok() {
308+
shutdown_force_trigger.into_inner().disable();
309+
true
310+
} else {
311+
error!(
312+
"Source '{}' failed to shutdown before deadline. Forcing shutdown.",
313+
id,
314+
);
315+
shutdown_force_trigger.into_inner().cancel();
316+
false
317+
}
310318
} else {
311-
error!(
312-
"Source '{}' failed to shutdown before deadline. Forcing shutdown.",
313-
id,
314-
);
315-
shutdown_force_trigger.into_inner().cancel();
316-
false
319+
fut.await;
320+
true
317321
}
318322
}
319323
.boxed()

src/app.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#![allow(missing_docs)]
2-
use std::{collections::HashMap, num::NonZeroUsize, path::PathBuf};
2+
use std::{collections::HashMap, num::NonZeroUsize, path::PathBuf, time::Duration};
33

44
use exitcode::ExitCode;
55
use futures::StreamExt;
@@ -62,10 +62,14 @@ impl ApplicationConfig {
6262
) -> Result<Self, ExitCode> {
6363
let config_paths = opts.config_paths_with_formats();
6464

65+
let graceful_shutdown_duration = (!opts.no_graceful_shutdown_limit)
66+
.then(|| Duration::from_secs(u64::from(opts.graceful_shutdown_limit_secs)));
67+
6568
let config = load_configs(
6669
&config_paths,
6770
opts.watch_config,
6871
opts.require_healthy,
72+
graceful_shutdown_duration,
6973
signal_handler,
7074
)
7175
.await?;
@@ -410,6 +414,7 @@ pub async fn load_configs(
410414
config_paths: &[ConfigPath],
411415
watch_config: bool,
412416
require_healthy: Option<bool>,
417+
graceful_shutdown_duration: Option<Duration>,
413418
signal_handler: &mut SignalHandler,
414419
) -> Result<Config, ExitCode> {
415420
let config_paths = config::process_paths(config_paths).ok_or(exitcode::CONFIG)?;
@@ -440,6 +445,7 @@ pub async fn load_configs(
440445
info!("Health checks are disabled.");
441446
}
442447
config.healthchecks.set_require_healthy(require_healthy);
448+
config.graceful_shutdown_duration = graceful_shutdown_duration;
443449

444450
Ok(config)
445451
}

src/cli.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#![allow(missing_docs)]
2-
use std::path::PathBuf;
2+
use std::{num::NonZeroU64, path::PathBuf};
33

44
use clap::{ArgAction, CommandFactory, FromArgMatches, Parser};
55

@@ -159,6 +159,28 @@ pub struct RootOpts {
159159
)]
160160
pub internal_log_rate_limit: u64,
161161

162+
/// Set the duration in seconds to wait for graceful shutdown after SIGINT or SIGTERM are
163+
/// received. After the duration has passed, Vector will force shutdown. To never force
164+
/// shutdown, use `--no-graceful-shutdown-limit`.
165+
#[arg(
166+
long,
167+
default_value = "60",
168+
env = "VECTOR_GRACEFUL_SHUTDOWN_LIMIT_SECS",
169+
group = "graceful-shutdown-limit"
170+
)]
171+
pub graceful_shutdown_limit_secs: NonZeroU64,
172+
173+
/// Never time out while waiting for graceful shutdown after SIGINT or SIGTERM received.
174+
/// This is useful when you would like for Vector to attempt to send data until terminated
175+
/// by a SIGKILL. Overrides/cannot be set with `--graceful-shutdown-limit-secs`.
176+
#[arg(
177+
long,
178+
default_value = "false",
179+
env = "VECTOR_NO_GRACEFUL_SHUTDOWN_LIMIT",
180+
group = "graceful-shutdown-limit"
181+
)]
182+
pub no_graceful_shutdown_limit: bool,
183+
162184
/// Set runtime allocation tracing
163185
#[cfg(feature = "allocation-tracing")]
164186
#[arg(long, env = "ALLOCATION_TRACING", default_value = "false")]

src/config/builder.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#[cfg(feature = "enterprise")]
22
use std::collections::BTreeMap;
3-
use std::path::Path;
3+
use std::{path::Path, time::Duration};
44

55
use indexmap::IndexMap;
66
#[cfg(feature = "enterprise")]
@@ -78,6 +78,13 @@ pub struct ConfigBuilder {
7878
/// All configured secrets backends.
7979
#[serde(default)]
8080
pub secret: IndexMap<ComponentKey, SecretBackends>,
81+
82+
/// The duration in seconds to wait for graceful shutdown after SIGINT or SIGTERM are received.
83+
/// After the duration has passed, Vector will force shutdown. Default value is 60 seconds. This
84+
/// value can be set using a [cli arg](crate::cli::RootOpts::graceful_shutdown_limit_secs).
85+
#[serde(default, skip)]
86+
#[doc(hidden)]
87+
pub graceful_shutdown_duration: Option<Duration>,
8188
}
8289

8390
#[cfg(feature = "enterprise")]
@@ -195,6 +202,7 @@ impl From<Config> for ConfigBuilder {
195202
transforms,
196203
tests,
197204
secret,
205+
graceful_shutdown_duration,
198206
hash: _,
199207
} = config;
200208

@@ -225,6 +233,7 @@ impl From<Config> for ConfigBuilder {
225233
provider: None,
226234
tests,
227235
secret,
236+
graceful_shutdown_duration,
228237
}
229238
}
230239
}

src/config/compiler.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ pub fn compile(mut builder: ConfigBuilder) -> Result<(Config, Vec<String>), Vec<
5656
tests,
5757
provider: _,
5858
secret,
59+
graceful_shutdown_duration,
5960
} = builder;
6061

6162
let graph = match Graph::new(&sources, &transforms, &sinks, schema) {
@@ -111,6 +112,7 @@ pub fn compile(mut builder: ConfigBuilder) -> Result<(Config, Vec<String>), Vec<
111112
transforms,
112113
tests,
113114
secret,
115+
graceful_shutdown_duration,
114116
};
115117

116118
config.propagate_acknowledgements()?;

src/config/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::{
55
hash::Hash,
66
net::SocketAddr,
77
path::PathBuf,
8+
time::Duration,
89
};
910

1011
use indexmap::IndexMap;
@@ -105,6 +106,7 @@ pub struct Config {
105106
pub enrichment_tables: IndexMap<ComponentKey, EnrichmentTableOuter>,
106107
tests: Vec<TestDefinition>,
107108
secret: IndexMap<ComponentKey, SecretBackends>,
109+
pub graceful_shutdown_duration: Option<Duration>,
108110
}
109111

110112
impl Config {

src/sources/journald.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,7 @@ mod tests {
11081108

11091109
sleep(Duration::from_millis(100)).await;
11101110
shutdown
1111-
.shutdown_all(Instant::now() + Duration::from_secs(1))
1111+
.shutdown_all(Some(Instant::now() + Duration::from_secs(1)))
11121112
.await;
11131113

11141114
timeout(Duration::from_secs(1), rx.collect()).await.unwrap()

src/sources/statsd/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ mod test {
487487
// everything that was in up without having to know the exact count.
488488
sleep(Duration::from_millis(250)).await;
489489
shutdown
490-
.shutdown_all(Instant::now() + Duration::from_millis(100))
490+
.shutdown_all(Some(Instant::now() + Duration::from_millis(100)))
491491
.await;
492492

493493
// Read all the events into a `MetricState`, which handles normalizing metrics and tracking
@@ -579,7 +579,7 @@ mod test {
579579
// everything that was in up without having to know the exact count.
580580
sleep(Duration::from_millis(250)).await;
581581
shutdown
582-
.shutdown_all(Instant::now() + Duration::from_millis(100))
582+
.shutdown_all(Some(Instant::now() + Duration::from_millis(100)))
583583
.await;
584584
}
585585
}

src/sources/syslog.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,7 +1153,7 @@ mod test {
11531153

11541154
// Shutdown the source, and make sure we've got all the messages we sent in.
11551155
shutdown
1156-
.shutdown_all(Instant::now() + Duration::from_millis(100))
1156+
.shutdown_all(Some(Instant::now() + Duration::from_millis(100)))
11571157
.await;
11581158
shutdown_complete.await;
11591159

@@ -1230,7 +1230,7 @@ mod test {
12301230
sleep(Duration::from_secs(1)).await;
12311231

12321232
shutdown
1233-
.shutdown_all(Instant::now() + Duration::from_millis(100))
1233+
.shutdown_all(Some(Instant::now() + Duration::from_millis(100)))
12341234
.await;
12351235
shutdown_complete.await;
12361236

@@ -1307,7 +1307,7 @@ mod test {
13071307

13081308
// Shutdown the source, and make sure we've got all the messages we sent in.
13091309
shutdown
1310-
.shutdown_all(Instant::now() + Duration::from_millis(100))
1310+
.shutdown_all(Some(Instant::now() + Duration::from_millis(100)))
13111311
.await;
13121312
shutdown_complete.await;
13131313

src/topology/running.rs

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ pub struct RunningTopology {
4545
abort_tx: mpsc::UnboundedSender<()>,
4646
watch: (WatchTx, WatchRx),
4747
pub(crate) running: Arc<AtomicBool>,
48+
graceful_shutdown_duration: Option<Duration>,
4849
}
4950

5051
impl RunningTopology {
@@ -54,14 +55,15 @@ impl RunningTopology {
5455
inputs_tap_metadata: HashMap::new(),
5556
outputs: HashMap::new(),
5657
outputs_tap_metadata: HashMap::new(),
57-
config,
5858
shutdown_coordinator: SourceShutdownCoordinator::default(),
5959
detach_triggers: HashMap::new(),
6060
source_tasks: HashMap::new(),
6161
tasks: HashMap::new(),
6262
abort_tx,
6363
watch: watch::channel(TapResource::default()),
6464
running: Arc::new(AtomicBool::new(true)),
65+
graceful_shutdown_duration: config.graceful_shutdown_duration,
66+
config,
6567
}
6668
}
6769

@@ -120,30 +122,36 @@ impl RunningTopology {
120122
check_handles.entry(key).or_default().push(task);
121123
}
122124

123-
// If we reach this, we will forcefully shutdown the sources.
124-
let deadline = Instant::now() + Duration::from_secs(60);
125-
126-
// If we reach the deadline, this future will print out which components
127-
// won't gracefully shutdown since we will start to forcefully shutdown
128-
// the sources.
129-
let mut check_handles2 = check_handles.clone();
130-
let timeout = async move {
131-
sleep_until(deadline).await;
132-
// Remove all tasks that have shutdown.
133-
check_handles2.retain(|_key, handles| {
134-
retain(handles, |handle| handle.peek().is_none());
135-
!handles.is_empty()
136-
});
137-
let remaining_components = check_handles2
138-
.keys()
139-
.map(|item| item.to_string())
140-
.collect::<Vec<_>>()
141-
.join(", ");
125+
// If we reach this, we will forcefully shutdown the sources. If None, we will never force shutdown.
126+
let deadline = self
127+
.graceful_shutdown_duration
128+
.map(|grace_period| Instant::now() + grace_period);
142129

143-
error!(
144-
components = ?remaining_components,
145-
"Failed to gracefully shut down in time. Killing components."
146-
);
130+
let timeout = if let Some(deadline) = deadline {
131+
// If we reach the deadline, this future will print out which components
132+
// won't gracefully shutdown since we will start to forcefully shutdown
133+
// the sources.
134+
let mut check_handles2 = check_handles.clone();
135+
Box::pin(async move {
136+
sleep_until(deadline).await;
137+
// Remove all tasks that have shutdown.
138+
check_handles2.retain(|_key, handles| {
139+
retain(handles, |handle| handle.peek().is_none());
140+
!handles.is_empty()
141+
});
142+
let remaining_components = check_handles2
143+
.keys()
144+
.map(|item| item.to_string())
145+
.collect::<Vec<_>>()
146+
.join(", ");
147+
148+
error!(
149+
components = ?remaining_components,
150+
"Failed to gracefully shut down in time. Killing components."
151+
);
152+
}) as future::BoxFuture<'static, ()>
153+
} else {
154+
Box::pin(future::pending()) as future::BoxFuture<'static, ()>
147155
};
148156

149157
// Reports in intervals which components are still running.
@@ -163,10 +171,12 @@ impl RunningTopology {
163171
.collect::<Vec<_>>()
164172
.join(", ");
165173

166-
let time_remaining = match deadline.checked_duration_since(Instant::now()) {
167-
Some(remaining) => format!("{} seconds left", remaining.as_secs()),
168-
None => "overdue".to_string(),
169-
};
174+
let time_remaining = deadline
175+
.map(|d| match d.checked_duration_since(Instant::now()) {
176+
Some(remaining) => format!("{} seconds left", remaining.as_secs()),
177+
None => "overdue".to_string(),
178+
})
179+
.unwrap_or("no time limit".to_string());
170180

171181
info!(
172182
remaining_components = ?remaining_components,

0 commit comments

Comments
 (0)