LCOV - code coverage report
Current view: top level - common - watchdog.c (source / functions) Hit Total Coverage
Test: Pacemaker code coverage Lines: 0 106 0.0 %
Date: 2024-05-07 11:09:47 Functions: 0 9 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright 2013-2024 the Pacemaker project contributors
       3             :  *
       4             :  * The version control history for this file may have further details.
       5             :  *
       6             :  * This source code is licensed under the GNU Lesser General Public License
       7             :  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
       8             :  */
       9             : 
      10             : #include <crm_internal.h>
      11             : 
      12             : #include <sched.h>
      13             : #include <sys/ioctl.h>
      14             : #include <sys/reboot.h>
      15             : 
      16             : #include <sys/types.h>
      17             : #include <sys/stat.h>
      18             : #include <unistd.h>
      19             : #include <ctype.h>
      20             : #include <dirent.h>
      21             : #include <signal.h>
      22             : 
      23             : static pid_t sbd_pid = 0;
      24             : 
      25             : static void
      26           0 : sysrq_trigger(char t)
      27             : {
      28             : #if HAVE_LINUX_PROCFS
      29             :     FILE *procf;
      30             : 
      31             :     // Root can always write here, regardless of kernel.sysrq value
      32           0 :     procf = fopen("/proc/sysrq-trigger", "a");
      33           0 :     if (!procf) {
      34           0 :         crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
      35           0 :         return;
      36             :     }
      37           0 :     crm_info("sysrq-trigger: %c", t);
      38           0 :     fprintf(procf, "%c\n", t);
      39           0 :     fclose(procf);
      40             : #endif // HAVE_LINUX_PROCFS
      41           0 :     return;
      42             : }
      43             : 
      44             : 
      45             : /*!
      46             :  * \internal
      47             :  * \brief Panic the local host (if root) or tell pacemakerd to do so
      48             :  */
      49             : static void
      50           0 : panic_local(void)
      51             : {
      52           0 :     int rc = pcmk_ok;
      53           0 :     uid_t uid = geteuid();
      54           0 :     pid_t ppid = getppid();
      55           0 :     const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
      56             : 
      57           0 :     if(uid != 0 && ppid > 1) {
      58             :         /* We're a non-root pacemaker daemon (pacemaker-based,
      59             :          * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
      60             :          * the original pacemakerd parent.
      61             :          *
      62             :          * Of these, only the controller is likely to be initiating resets.
      63             :          */
      64           0 :         crm_emerg("Signaling parent %lld to panic", (long long) ppid);
      65           0 :         crm_exit(CRM_EX_PANIC);
      66             :         return;
      67             : 
      68           0 :     } else if (uid != 0) {
      69             : #if HAVE_LINUX_PROCFS
      70             :         /*
      71             :          * No permissions, and no pacemakerd parent to escalate to.
      72             :          * Track down the new pacemakerd process and send a signal instead.
      73             :          */
      74             :         union sigval signal_value;
      75             : 
      76           0 :         memset(&signal_value, 0, sizeof(signal_value));
      77           0 :         ppid = pcmk__procfs_pid_of("pacemakerd");
      78           0 :         crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
      79             : 
      80           0 :         if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
      81           0 :             crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
      82             :                        (long long) ppid);
      83             :         }
      84             : #endif // HAVE_LINUX_PROCFS
      85             : 
      86             :         /* The best we can do now is die */
      87           0 :         crm_exit(CRM_EX_PANIC);
      88             :         return;
      89             :     }
      90             : 
      91             :     /* We're either pacemakerd, or a pacemaker daemon running as root */
      92             : 
      93           0 :     if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
      94           0 :         sysrq_trigger('c');
      95             : 
      96           0 :     } else if (pcmk__str_eq(panic_action, "sync-crash", pcmk__str_casei)) {
      97           0 :         sync();
      98           0 :         sysrq_trigger('c');
      99             : 
     100             :     } else {
     101           0 :         if (pcmk__str_eq(panic_action, "sync-reboot", pcmk__str_casei)) {
     102           0 :             sync();
     103             :         }
     104           0 :         sysrq_trigger('b');
     105             :     }
     106             :     /* reboot(RB_HALT_SYSTEM); rc = errno; */
     107           0 :     reboot(RB_AUTOBOOT);
     108           0 :     rc = errno;
     109             : 
     110           0 :     crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
     111             :               (long long) ppid, pcmk_rc_str(rc), rc);
     112             : 
     113           0 :     if(ppid > 1) {
     114             :         /* child daemon */
     115           0 :         exit(CRM_EX_PANIC);
     116             :     } else {
     117             :         /* pacemakerd or orphan child */
     118           0 :         exit(CRM_EX_FATAL);
     119             :     }
     120             : }
     121             : 
     122             : /*!
     123             :  * \internal
     124             :  * \brief Tell sbd to kill the local host, then exit
     125             :  */
     126             : static void
     127           0 : panic_sbd(void)
     128             : {
     129             :     union sigval signal_value;
     130           0 :     pid_t ppid = getppid();
     131             : 
     132           0 :     crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
     133             : 
     134           0 :     memset(&signal_value, 0, sizeof(signal_value));
     135             :     /* TODO: Arrange for a slightly less brutal option? */
     136           0 :     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
     137           0 :         crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
     138             :                    (long long) sbd_pid);
     139           0 :         panic_local();
     140             :     }
     141             : 
     142           0 :     if(ppid > 1) {
     143             :         /* child daemon */
     144           0 :         exit(CRM_EX_PANIC);
     145             :     } else {
     146             :         /* pacemakerd or orphan child */
     147           0 :         exit(CRM_EX_FATAL);
     148             :     }
     149             : }
     150             : 
     151             : /*!
     152             :  * \internal
     153             :  * \brief Panic the local host
     154             :  *
     155             :  * Panic the local host either by sbd (if running), directly, or by asking
     156             :  * pacemakerd. If trace logging this function, exit instead.
     157             :  *
     158             :  * \param[in] origin   Function caller (for logging only)
     159             :  */
     160             : void
     161           0 : pcmk__panic(const char *origin)
     162             : {
     163             :     /* Ensure sbd_pid is set */
     164           0 :     (void) pcmk__locate_sbd();
     165             : 
     166           0 :     pcmk__if_tracing(
     167             :         {
     168             :             // getppid() == 1 means our original parent no longer exists
     169             :             crm_emerg("Shutting down instead of panicking the node "
     170             :                       CRM_XS " origin=%s sbd=%lld parent=%d",
     171             :                       origin, (long long) sbd_pid, getppid());
     172             :             crm_exit(CRM_EX_FATAL);
     173             :             return;
     174             :         },
     175             :         {}
     176             :     );
     177             : 
     178           0 :     if(sbd_pid > 1) {
     179           0 :         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
     180             :                   (long long) sbd_pid, origin);
     181           0 :         panic_sbd();
     182             : 
     183             :     } else {
     184           0 :         crm_emerg("Panicking the system directly: %s", origin);
     185           0 :         panic_local();
     186             :     }
     187             : }
     188             : 
     189             : /*!
     190             :  * \internal
     191             :  * \brief Return the process ID of sbd (or 0 if it is not running)
     192             :  */
     193             : pid_t
     194           0 : pcmk__locate_sbd(void)
     195             : {
     196           0 :     char *pidfile = NULL;
     197           0 :     char *sbd_path = NULL;
     198             :     int rc;
     199             : 
     200           0 :     if(sbd_pid > 1) {
     201           0 :         return sbd_pid;
     202             :     }
     203             : 
     204             :     /* Look for the pid file */
     205           0 :     pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
     206           0 :     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
     207             : 
     208             :     /* Read the pid file */
     209           0 :     rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
     210           0 :     if (rc == pcmk_rc_ok) {
     211           0 :         crm_trace("SBD detected at pid %lld (via PID file %s)",
     212             :                   (long long) sbd_pid, pidfile);
     213             : 
     214             : #if HAVE_LINUX_PROCFS
     215             :     } else {
     216             :         /* Fall back to /proc for systems that support it */
     217           0 :         sbd_pid = pcmk__procfs_pid_of("sbd");
     218           0 :         crm_trace("SBD detected at pid %lld (via procfs)",
     219             :                   (long long) sbd_pid);
     220             : #endif // HAVE_LINUX_PROCFS
     221             :     }
     222             : 
     223           0 :     if(sbd_pid < 0) {
     224           0 :         sbd_pid = 0;
     225           0 :         crm_trace("SBD not detected");
     226             :     }
     227             : 
     228           0 :     free(pidfile);
     229           0 :     free(sbd_path);
     230             : 
     231           0 :     return sbd_pid;
     232             : }
     233             : 
     234             : long
     235           0 : pcmk__get_sbd_watchdog_timeout(void)
     236             : {
     237             :     static long sbd_timeout = -2;
     238             : 
     239           0 :     if (sbd_timeout == -2) {
     240           0 :         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
     241             :     }
     242           0 :     return sbd_timeout;
     243             : }
     244             : 
     245             : bool
     246           0 : pcmk__get_sbd_sync_resource_startup(void)
     247             : {
     248             :     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
     249             :     static bool checked_sync_resource_startup = false;
     250             : 
     251           0 :     if (!checked_sync_resource_startup) {
     252           0 :         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
     253             : 
     254           0 :         if (sync_env == NULL) {
     255           0 :             crm_trace("Defaulting to %sstart-up synchronization with sbd",
     256             :                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
     257             : 
     258           0 :         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
     259           0 :             crm_warn("Defaulting to %sstart-up synchronization with sbd "
     260             :                      "because environment value '%s' is invalid",
     261             :                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
     262             :         }
     263           0 :         checked_sync_resource_startup = true;
     264             :     }
     265           0 :     return sync_resource_startup != 0;
     266             : }
     267             : 
     268             : long
     269           0 : pcmk__auto_stonith_watchdog_timeout(void)
     270             : {
     271           0 :     long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
     272             : 
     273           0 :     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
     274             : }
     275             : 
     276             : bool
     277           0 : pcmk__valid_stonith_watchdog_timeout(const char *value)
     278             : {
     279             :     /* @COMPAT At a compatibility break, accept either negative values or a
     280             :      * specific string like "auto" (but not both) to mean "auto-calculate the
     281             :      * timeout." Reject other values that aren't parsable as timeouts.
     282             :      */
     283           0 :     long st_timeout = value? crm_get_msec(value) : 0;
     284             : 
     285           0 :     if (st_timeout < 0) {
     286           0 :         st_timeout = pcmk__auto_stonith_watchdog_timeout();
     287           0 :         crm_debug("Using calculated value %ld for "
     288             :                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " (%s)",
     289             :                   st_timeout, value);
     290             :     }
     291             : 
     292           0 :     if (st_timeout == 0) {
     293           0 :         crm_debug("Watchdog may be enabled but "
     294             :                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
     295             :                   value? value : "default");
     296             : 
     297           0 :     } else if (pcmk__locate_sbd() == 0) {
     298           0 :         crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
     299             :                   " configured (%s) but SBD not active",
     300             :                   pcmk__s(value, "auto"));
     301           0 :         crm_exit(CRM_EX_FATAL);
     302             :         return false;
     303             : 
     304             :     } else {
     305           0 :         long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
     306             : 
     307           0 :         if (st_timeout < sbd_timeout) {
     308           0 :             crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
     309             :                       " (%s) too short (must be >%ldms)",
     310             :                       value, sbd_timeout);
     311           0 :             crm_exit(CRM_EX_FATAL);
     312             :             return false;
     313             :         }
     314           0 :         crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
     315             :                  " %s and SBD timeout %ldms",
     316             :                  value, sbd_timeout);
     317             :     }
     318           0 :     return true;
     319             : }

Generated by: LCOV version 1.14