Line data Source code
1 : /*
2 : * Copyright 2013-2024 the Pacemaker project contributors
3 : *
4 : * The version control history for this file may have further details.
5 : *
6 : * This source code is licensed under the GNU Lesser General Public License
7 : * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 : */
9 :
10 : #include <crm_internal.h>
11 :
12 : #include <sched.h>
13 : #include <sys/ioctl.h>
14 : #include <sys/reboot.h>
15 :
16 : #include <sys/types.h>
17 : #include <sys/stat.h>
18 : #include <unistd.h>
19 : #include <ctype.h>
20 : #include <dirent.h>
21 : #include <signal.h>
22 :
23 : static pid_t sbd_pid = 0;
24 :
/*!
 * \internal
 * \brief Write a single command character to the kernel's sysrq trigger file
 *
 * Does nothing unless built with HAVE_LINUX_PROCFS. Failures to open, write,
 * or close the trigger file are logged as warnings; the caller is not told,
 * so it should have its own fallback.
 *
 * \param[in] t  Sysrq command character to write (callers in this file use
 *               'b' and 'c')
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    FILE *procf;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (!procf) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }
    crm_info("sysrq-trigger: %c", t);

    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
    }

    /* The stream is buffered, so the write may only reach the kernel (and
     * fail) when the stream is flushed by fclose()
     */
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#else
    (void) t;   // Nothing to trigger without procfs
#endif // HAVE_LINUX_PROCFS
    return;
}
43 :
44 :
45 : /*!
46 : * \internal
47 : * \brief Panic the local host (if root) or tell pacemakerd to do so
48 : */
49 : static void
50 0 : panic_local(void)
51 : {
52 0 : int rc = pcmk_ok;
53 0 : uid_t uid = geteuid();
54 0 : pid_t ppid = getppid();
55 0 : const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
56 :
57 0 : if(uid != 0 && ppid > 1) {
58 : /* We're a non-root pacemaker daemon (pacemaker-based,
59 : * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
60 : * the original pacemakerd parent.
61 : *
62 : * Of these, only the controller is likely to be initiating resets.
63 : */
64 0 : crm_emerg("Signaling parent %lld to panic", (long long) ppid);
65 0 : crm_exit(CRM_EX_PANIC);
66 : return;
67 :
68 0 : } else if (uid != 0) {
69 : #if HAVE_LINUX_PROCFS
70 : /*
71 : * No permissions, and no pacemakerd parent to escalate to.
72 : * Track down the new pacemakerd process and send a signal instead.
73 : */
74 : union sigval signal_value;
75 :
76 0 : memset(&signal_value, 0, sizeof(signal_value));
77 0 : ppid = pcmk__procfs_pid_of("pacemakerd");
78 0 : crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
79 :
80 0 : if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
81 0 : crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
82 : (long long) ppid);
83 : }
84 : #endif // HAVE_LINUX_PROCFS
85 :
86 : /* The best we can do now is die */
87 0 : crm_exit(CRM_EX_PANIC);
88 : return;
89 : }
90 :
91 : /* We're either pacemakerd, or a pacemaker daemon running as root */
92 :
93 0 : if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
94 0 : sysrq_trigger('c');
95 :
96 0 : } else if (pcmk__str_eq(panic_action, "sync-crash", pcmk__str_casei)) {
97 0 : sync();
98 0 : sysrq_trigger('c');
99 :
100 : } else {
101 0 : if (pcmk__str_eq(panic_action, "sync-reboot", pcmk__str_casei)) {
102 0 : sync();
103 : }
104 0 : sysrq_trigger('b');
105 : }
106 : /* reboot(RB_HALT_SYSTEM); rc = errno; */
107 0 : reboot(RB_AUTOBOOT);
108 0 : rc = errno;
109 :
110 0 : crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
111 : (long long) ppid, pcmk_rc_str(rc), rc);
112 :
113 0 : if(ppid > 1) {
114 : /* child daemon */
115 0 : exit(CRM_EX_PANIC);
116 : } else {
117 : /* pacemakerd or orphan child */
118 0 : exit(CRM_EX_FATAL);
119 : }
120 : }
121 :
122 : /*!
123 : * \internal
124 : * \brief Tell sbd to kill the local host, then exit
125 : */
126 : static void
127 0 : panic_sbd(void)
128 : {
129 : union sigval signal_value;
130 0 : pid_t ppid = getppid();
131 :
132 0 : crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
133 :
134 0 : memset(&signal_value, 0, sizeof(signal_value));
135 : /* TODO: Arrange for a slightly less brutal option? */
136 0 : if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
137 0 : crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
138 : (long long) sbd_pid);
139 0 : panic_local();
140 : }
141 :
142 0 : if(ppid > 1) {
143 : /* child daemon */
144 0 : exit(CRM_EX_PANIC);
145 : } else {
146 : /* pacemakerd or orphan child */
147 0 : exit(CRM_EX_FATAL);
148 : }
149 : }
150 :
151 : /*!
152 : * \internal
153 : * \brief Panic the local host
154 : *
155 : * Panic the local host either by sbd (if running), directly, or by asking
156 : * pacemakerd. If trace logging this function, exit instead.
157 : *
158 : * \param[in] origin Function caller (for logging only)
159 : */
160 : void
161 0 : pcmk__panic(const char *origin)
162 : {
163 : /* Ensure sbd_pid is set */
164 0 : (void) pcmk__locate_sbd();
165 :
166 0 : pcmk__if_tracing(
167 : {
168 : // getppid() == 1 means our original parent no longer exists
169 : crm_emerg("Shutting down instead of panicking the node "
170 : CRM_XS " origin=%s sbd=%lld parent=%d",
171 : origin, (long long) sbd_pid, getppid());
172 : crm_exit(CRM_EX_FATAL);
173 : return;
174 : },
175 : {}
176 : );
177 :
178 0 : if(sbd_pid > 1) {
179 0 : crm_emerg("Signaling sbd[%lld] to panic the system: %s",
180 : (long long) sbd_pid, origin);
181 0 : panic_sbd();
182 :
183 : } else {
184 0 : crm_emerg("Panicking the system directly: %s", origin);
185 0 : panic_local();
186 : }
187 : }
188 :
189 : /*!
190 : * \internal
191 : * \brief Return the process ID of sbd (or 0 if it is not running)
192 : */
193 : pid_t
194 0 : pcmk__locate_sbd(void)
195 : {
196 0 : char *pidfile = NULL;
197 0 : char *sbd_path = NULL;
198 : int rc;
199 :
200 0 : if(sbd_pid > 1) {
201 0 : return sbd_pid;
202 : }
203 :
204 : /* Look for the pid file */
205 0 : pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
206 0 : sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
207 :
208 : /* Read the pid file */
209 0 : rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
210 0 : if (rc == pcmk_rc_ok) {
211 0 : crm_trace("SBD detected at pid %lld (via PID file %s)",
212 : (long long) sbd_pid, pidfile);
213 :
214 : #if HAVE_LINUX_PROCFS
215 : } else {
216 : /* Fall back to /proc for systems that support it */
217 0 : sbd_pid = pcmk__procfs_pid_of("sbd");
218 0 : crm_trace("SBD detected at pid %lld (via procfs)",
219 : (long long) sbd_pid);
220 : #endif // HAVE_LINUX_PROCFS
221 : }
222 :
223 0 : if(sbd_pid < 0) {
224 0 : sbd_pid = 0;
225 0 : crm_trace("SBD not detected");
226 : }
227 :
228 0 : free(pidfile);
229 0 : free(sbd_path);
230 :
231 0 : return sbd_pid;
232 : }
233 :
/*!
 * \internal
 * \brief Get the SBD watchdog timeout from the environment
 *
 * The SBD_WATCHDOG_TIMEOUT environment variable is parsed with crm_get_msec()
 * on first use and the result is cached for later calls.
 *
 * \return Cached result of parsing SBD_WATCHDOG_TIMEOUT
 */
long
pcmk__get_sbd_watchdog_timeout(void)
{
    // -2 marks "environment not read yet"
    static long cached_timeout = -2;

    if (cached_timeout != -2) {
        return cached_timeout;
    }

    cached_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    return cached_timeout;
}
244 :
245 : bool
246 0 : pcmk__get_sbd_sync_resource_startup(void)
247 : {
248 : static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
249 : static bool checked_sync_resource_startup = false;
250 :
251 0 : if (!checked_sync_resource_startup) {
252 0 : const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
253 :
254 0 : if (sync_env == NULL) {
255 0 : crm_trace("Defaulting to %sstart-up synchronization with sbd",
256 : (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
257 :
258 0 : } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
259 0 : crm_warn("Defaulting to %sstart-up synchronization with sbd "
260 : "because environment value '%s' is invalid",
261 : (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
262 : }
263 0 : checked_sync_resource_startup = true;
264 : }
265 0 : return sync_resource_startup != 0;
266 : }
267 :
/*!
 * \internal
 * \brief Calculate a stonith watchdog timeout from the SBD watchdog timeout
 *
 * \return Twice the SBD watchdog timeout if that is positive, otherwise 0
 */
long
pcmk__auto_stonith_watchdog_timeout(void)
{
    const long sbd_msec = pcmk__get_sbd_watchdog_timeout();

    if (sbd_msec <= 0) {
        return 0;
    }
    return 2 * sbd_msec;
}
275 :
276 : bool
277 0 : pcmk__valid_stonith_watchdog_timeout(const char *value)
278 : {
279 : /* @COMPAT At a compatibility break, accept either negative values or a
280 : * specific string like "auto" (but not both) to mean "auto-calculate the
281 : * timeout." Reject other values that aren't parsable as timeouts.
282 : */
283 0 : long st_timeout = value? crm_get_msec(value) : 0;
284 :
285 0 : if (st_timeout < 0) {
286 0 : st_timeout = pcmk__auto_stonith_watchdog_timeout();
287 0 : crm_debug("Using calculated value %ld for "
288 : PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " (%s)",
289 : st_timeout, value);
290 : }
291 :
292 0 : if (st_timeout == 0) {
293 0 : crm_debug("Watchdog may be enabled but "
294 : PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
295 : value? value : "default");
296 :
297 0 : } else if (pcmk__locate_sbd() == 0) {
298 0 : crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
299 : " configured (%s) but SBD not active",
300 : pcmk__s(value, "auto"));
301 0 : crm_exit(CRM_EX_FATAL);
302 : return false;
303 :
304 : } else {
305 0 : long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
306 :
307 0 : if (st_timeout < sbd_timeout) {
308 0 : crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
309 : " (%s) too short (must be >%ldms)",
310 : value, sbd_timeout);
311 0 : crm_exit(CRM_EX_FATAL);
312 : return false;
313 : }
314 0 : crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
315 : " %s and SBD timeout %ldms",
316 : value, sbd_timeout);
317 : }
318 0 : return true;
319 : }
|