Skip to content

Commit 0e8d95f

Browse files
authored
feat: health check for scheduler (ENG-2154) (#43)
PR #42 added the check for expected periodic jobs to the worker health check, but that doesn't actually help: a failing worker health check restarts the worker process, not the scheduler process. This commit adds a separate health check for the scheduler and moves the periodic-jobs check into it. Fixes: [ENG-2154](https://stacklet.atlassian.net/browse/ENG-2154)
1 parent b6a5527 commit 0e8d95f

4 files changed

Lines changed: 65 additions & 10 deletions

File tree

bin/docker-entrypoint

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ set -e
44
scheduler() {
55
echo "Starting RQ scheduler..."
66

7-
exec /app/manage.py rq scheduler
7+
exec supervisord -c scheduler.conf
88
}
99

1010
dev_scheduler() {

redash/cli/rq.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,34 @@ def scheduler():
3232
rq_scheduler.run()
3333

3434

35+
class SchedulerHealthcheck(base.BaseCheck):
36+
NAME = "RQ Scheduler Healthcheck"
37+
38+
def __call__(self, process_spec):
39+
pjobs_ok, num_pjobs, num_missing_pjobs = check_periodic_jobs()
40+
41+
is_healthy = pjobs_ok
42+
43+
self._log(
44+
"Scheduler healthcheck: "
45+
"Periodic jobs ok? %s (%s/%s jobs scheduled). "
46+
"==> Is healthy? %s",
47+
pjobs_ok,
48+
num_pjobs - num_missing_pjobs,
49+
num_pjobs,
50+
is_healthy,
51+
)
52+
53+
return is_healthy
54+
55+
56+
@manager.command()
57+
def scheduler_healthcheck():
58+
return check_runner.CheckRunner(
59+
"scheduler_healthcheck", "scheduler", None, [(SchedulerHealthcheck, {})]
60+
).run()
61+
62+
3563
@manager.command()
3664
@argument("queues", nargs=-1)
3765
def worker(queues):
@@ -76,33 +104,27 @@ def __call__(self, process_spec):
76104
total_jobs_in_watched_queues = sum([len(q.jobs) for q in worker.queues])
77105
has_nothing_to_do = total_jobs_in_watched_queues == 0
78106

79-
pjobs_ok, num_pjobs, num_missing_pjobs = check_periodic_jobs()
80-
81-
is_healthy = (is_busy or seen_lately or has_nothing_to_do) and pjobs_ok
107+
is_healthy = is_busy or seen_lately or has_nothing_to_do
82108

83109
self._log(
84110
"Worker %s healthcheck: Is busy? %s. "
85111
"Seen lately? %s (%d seconds ago). "
86112
"Has nothing to do? %s (%d jobs in watched queues). "
87-
"Periodic jobs ok? %s (%s missing of %s). "
88113
"==> Is healthy? %s",
89114
worker.key,
90115
is_busy,
91116
seen_lately,
92117
time_since_seen.seconds,
93118
has_nothing_to_do,
94119
total_jobs_in_watched_queues,
95-
pjobs_ok,
96-
num_missing_pjobs,
97-
num_pjobs,
98120
is_healthy,
99121
)
100122

101123
return is_healthy
102124

103125

104126
@manager.command()
105-
def healthcheck():
127+
def worker_healthcheck():
106128
return check_runner.CheckRunner(
107129
"worker_healthcheck", "worker", None, [(WorkerHealthcheck, {})]
108130
).run()

scheduler.conf

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[supervisord]
2+
logfile=/dev/null
3+
pidfile=/tmp/supervisord.pid
4+
nodaemon=true
5+
6+
[unix_http_server]
7+
file = /tmp/supervisor.sock
8+
9+
[rpcinterface:supervisor]
10+
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
11+
12+
[program:scheduler]
13+
command=./manage.py rq scheduler
14+
process_name=%(program_name)s-%(process_num)s
15+
numprocs=1
16+
directory=/app
17+
stopsignal=TERM
18+
autostart=true
19+
autorestart=true
20+
startsecs=300
21+
stdout_logfile=/dev/stdout
22+
stdout_logfile_maxbytes=0
23+
stderr_logfile=/dev/stderr
24+
stderr_logfile_maxbytes=0
25+
26+
[eventlistener:scheduler_healthcheck]
27+
serverurl=AUTO
28+
command=./manage.py rq scheduler_healthcheck
29+
stdout_logfile=/dev/stdout
30+
stdout_logfile_maxbytes=0
31+
stderr_logfile=/dev/stderr
32+
stderr_logfile_maxbytes=0
33+
events=TICK_60

worker.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ stderr_logfile_maxbytes=0
2525

2626
[eventlistener:worker_healthcheck]
2727
serverurl=AUTO
28-
command=./manage.py rq healthcheck
28+
command=./manage.py rq worker_healthcheck
2929
stdout_logfile=/dev/stdout
3030
stdout_logfile_maxbytes=0
3131
stderr_logfile=/dev/stderr

0 commit comments

Comments
 (0)