LCOV - code coverage report
Current view: top level - pengine - failcounts.c (source / functions) Hit Total Coverage
Test: Pacemaker code coverage Lines: 0 153 0.0 %
Date: 2024-05-07 11:09:47 Functions: 0 9 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright 2008-2024 the Pacemaker project contributors
       3             :  *
       4             :  * This source code is licensed under the GNU Lesser General Public License
       5             :  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
       6             :  */
       7             : 
       8             : #include <crm_internal.h>
       9             : 
      10             : #include <sys/types.h>
      11             : #include <regex.h>
      12             : #include <glib.h>
      13             : 
      14             : #include <crm/crm.h>
      15             : #include <crm/common/xml.h>
      16             : #include <crm/common/util.h>
      17             : #include <crm/pengine/internal.h>
      18             : 
      19             : static gboolean
      20           0 : is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
      21             :                    const xmlNode *lrm_op_xml)
      22             : {
      23           0 :     gboolean matched = FALSE;
      24           0 :     const char *conf_op_name = NULL;
      25           0 :     const char *lrm_op_task = NULL;
      26           0 :     const char *conf_op_interval_spec = NULL;
      27           0 :     guint conf_op_interval_ms = 0;
      28           0 :     guint lrm_op_interval_ms = 0;
      29           0 :     const char *lrm_op_id = NULL;
      30           0 :     char *last_failure_key = NULL;
      31             : 
      32           0 :     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
      33           0 :         return FALSE;
      34             :     }
      35             : 
      36             :     // Get name and interval from configured op
      37           0 :     conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
      38           0 :     conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
      39           0 :     pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
      40             : 
      41             :     // Get name and interval from op history entry
      42           0 :     lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
      43           0 :     crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
      44             : 
      45           0 :     if ((conf_op_interval_ms != lrm_op_interval_ms)
      46           0 :         || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
      47           0 :         return FALSE;
      48             :     }
      49             : 
      50           0 :     lrm_op_id = pcmk__xe_id(lrm_op_xml);
      51           0 :     last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
      52             : 
      53           0 :     if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
      54           0 :         matched = TRUE;
      55             : 
      56             :     } else {
      57           0 :         char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
      58             :                                                 conf_op_interval_ms);
      59             : 
      60           0 :         if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
      61           0 :             int rc = 0;
      62           0 :             int target_rc = pe__target_rc_from_xml(lrm_op_xml);
      63             : 
      64           0 :             crm_element_value_int(lrm_op_xml, PCMK__XA_RC_CODE, &rc);
      65           0 :             if (rc != target_rc) {
      66           0 :                 matched = TRUE;
      67             :             }
      68             :         }
      69           0 :         free(expected_op_key);
      70             :     }
      71             : 
      72           0 :     free(last_failure_key);
      73           0 :     return matched;
      74             : }
      75             : 
      76             : static gboolean
      77           0 : block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
      78             :               const xmlNode *xml_op)
      79             : {
      80           0 :     char *xml_name = clone_strip(rsc->id);
      81             : 
      82             :     /* @TODO This xpath search occurs after template expansion, but it is unable
      83             :      * to properly detect on-fail in id-ref, operation meta-attributes, or
      84             :      * op_defaults, or evaluate rules.
      85             :      *
      86             :      * Also, PCMK_META_ON_FAIL defaults to PCMK_VALUE_BLOCK (in
      87             :      * unpack_operation()) for stop actions when stonith is disabled.
      88             :      *
      89             :      * Ideally, we'd unpack the operation before this point, and pass in a
      90             :      * meta-attributes table that takes all that into consideration.
      91             :      */
      92           0 :     char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
      93             :                                     "[@" PCMK_XA_ID "='%s']"
      94             :                                     "//" PCMK_XE_OP
      95             :                                     "[@" PCMK_META_ON_FAIL
      96             :                                         "='" PCMK_VALUE_BLOCK "']",
      97             :                                     xml_name);
      98             : 
      99           0 :     xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
     100           0 :     gboolean should_block = FALSE;
     101             : 
     102           0 :     free(xpath);
     103             : 
     104           0 :     if (xpathObj) {
     105           0 :         int max = numXpathResults(xpathObj);
     106           0 :         int lpc = 0;
     107             : 
     108           0 :         for (lpc = 0; lpc < max; lpc++) {
     109           0 :             xmlNode *pref = getXpathResult(xpathObj, lpc);
     110             : 
     111           0 :             if (xml_op) {
     112           0 :                 should_block = is_matched_failure(xml_name, pref, xml_op);
     113           0 :                 if (should_block) {
     114           0 :                     break;
     115             :                 }
     116             : 
     117             :             } else {
     118           0 :                 const char *conf_op_name = NULL;
     119           0 :                 const char *conf_op_interval_spec = NULL;
     120           0 :                 guint conf_op_interval_ms = 0;
     121           0 :                 char *lrm_op_xpath = NULL;
     122           0 :                 xmlXPathObject *lrm_op_xpathObj = NULL;
     123             : 
     124             :                 // Get name and interval from configured op
     125           0 :                 conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
     126           0 :                 conf_op_interval_spec = crm_element_value(pref,
     127             :                                                           PCMK_META_INTERVAL);
     128           0 :                 pcmk_parse_interval_spec(conf_op_interval_spec,
     129             :                                          &conf_op_interval_ms);
     130             : 
     131             : #define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']"      \
     132             :                   "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']"       \
     133             :                   "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']"   \
     134             :                   "[@" PCMK_META_INTERVAL "='%u']"
     135             : 
     136           0 :                 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
     137           0 :                                                  node->details->uname, xml_name,
     138             :                                                  conf_op_name,
     139             :                                                  conf_op_interval_ms);
     140           0 :                 lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
     141             : 
     142           0 :                 free(lrm_op_xpath);
     143             : 
     144           0 :                 if (lrm_op_xpathObj) {
     145           0 :                     int max2 = numXpathResults(lrm_op_xpathObj);
     146           0 :                     int lpc2 = 0;
     147             : 
     148           0 :                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
     149           0 :                         xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
     150             :                                                              lpc2);
     151             : 
     152           0 :                         should_block = is_matched_failure(xml_name, pref,
     153             :                                                           lrm_op_xml);
     154           0 :                         if (should_block) {
     155           0 :                             break;
     156             :                         }
     157             :                     }
     158             :                 }
     159           0 :                 freeXpathObject(lrm_op_xpathObj);
     160             : 
     161           0 :                 if (should_block) {
     162           0 :                     break;
     163             :                 }
     164             :             }
     165             :         }
     166             :     }
     167             : 
     168           0 :     free(xml_name);
     169           0 :     freeXpathObject(xpathObj);
     170             : 
     171           0 :     return should_block;
     172             : }
     173             : 
     174             : /*!
     175             :  * \internal
     176             :  * \brief Get resource name as used in failure-related node attributes
     177             :  *
     178             :  * \param[in] rsc  Resource to check
     179             :  *
     180             :  * \return Newly allocated string containing resource's fail name
     181             :  * \note The caller is responsible for freeing the result.
     182             :  */
     183             : static inline char *
     184           0 : rsc_fail_name(const pcmk_resource_t *rsc)
     185             : {
     186           0 :     const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
     187             : 
     188           0 :     return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
     189             : }
     190             : 
     191             : /*!
     192             :  * \internal
     193             :  * \brief Compile regular expression to match a failure-related node attribute
     194             :  *
     195             :  * \param[in]  prefix    Attribute prefix to match
     196             :  * \param[in]  rsc_name  Resource name to match as used in failure attributes
     197             :  * \param[in]  is_legacy Whether DC uses per-resource fail counts
     198             :  * \param[in]  is_unique Whether the resource is a globally unique clone
     199             :  * \param[out] re        Where to store resulting regular expression
     200             :  *
     201             :  * \return Standard Pacemaker return code
     202             :  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
     203             :  *       The caller is responsible for freeing re with regfree().
     204             :  */
     205             : static int
     206           0 : generate_fail_regex(const char *prefix, const char *rsc_name,
     207             :                     gboolean is_legacy, gboolean is_unique, regex_t *re)
     208             : {
     209             :     char *pattern;
     210             : 
     211             :     /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
     212             :      * per-operation.
     213             :      */
     214           0 :     const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
     215             : 
     216             :     /* Ignore instance numbers for anything other than globally unique clones.
     217             :      * Anonymous clone fail counts could contain an instance number if the
     218             :      * clone was initially unique, failed, then was converted to anonymous.
     219             :      * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
     220             :      * clone instance numbers.
     221             :      */
     222           0 :     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
     223             : 
     224           0 :     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
     225             :                                 instance_pattern, op_pattern);
     226           0 :     if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
     227           0 :         free(pattern);
     228           0 :         return EINVAL;
     229             :     }
     230             : 
     231           0 :     free(pattern);
     232           0 :     return pcmk_rc_ok;
     233             : }
     234             : 
     235             : /*!
     236             :  * \internal
     237             :  * \brief Compile regular expressions to match failure-related node attributes
     238             :  *
     239             :  * \param[in]  rsc             Resource being checked for failures
     240             :  * \param[out] failcount_re    Storage for regular expression for fail count
     241             :  * \param[out] lastfailure_re  Storage for regular expression for last failure
     242             :  *
     243             :  * \return Standard Pacemaker return code
     244             :  * \note On success, the caller is responsible for freeing the expressions with
     245             :  *       regfree().
     246             :  */
     247             : static int
     248           0 : generate_fail_regexes(const pcmk_resource_t *rsc,
     249             :                       regex_t *failcount_re, regex_t *lastfailure_re)
     250             : {
     251           0 :     int rc = pcmk_rc_ok;
     252           0 :     char *rsc_name = rsc_fail_name(rsc);
     253           0 :     const char *version = crm_element_value(rsc->cluster->input,
     254             :                                             PCMK_XA_CRM_FEATURE_SET);
     255             : 
     256             :     // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource
     257           0 :     gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
     258             : 
     259           0 :     if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
     260           0 :                             pcmk_is_set(rsc->flags, pcmk_rsc_unique),
     261             :                             failcount_re) != pcmk_rc_ok) {
     262           0 :         rc = EINVAL;
     263             : 
     264           0 :     } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
     265             :                                    is_legacy,
     266           0 :                                    pcmk_is_set(rsc->flags, pcmk_rsc_unique),
     267             :                                    lastfailure_re) != pcmk_rc_ok) {
     268           0 :         rc = EINVAL;
     269           0 :         regfree(failcount_re);
     270             :     }
     271             : 
     272           0 :     free(rsc_name);
     273           0 :     return rc;
     274             : }
     275             : 
     276             : // Data for fail-count-related iterators
     277             : struct failcount_data {
     278             :     const pcmk_node_t *node;// Node to check for fail count
     279             :     pcmk_resource_t *rsc;     // Resource to check for fail count
     280             :     uint32_t flags;         // Fail count flags
     281             :     const xmlNode *xml_op;  // History entry for expiration purposes (or NULL)
     282             :     regex_t failcount_re;   // Fail count regular expression to match
     283             :     regex_t lastfailure_re; // Last failure regular expression to match
     284             :     int failcount;          // Fail count so far
     285             :     time_t last_failure;    // Time of most recent failure so far
     286             : };
     287             : 
     288             : /*!
     289             :  * \internal
     290             :  * \brief Update fail count and last failure appropriately for a node attribute
     291             :  *
     292             :  * \param[in] key        Node attribute name
     293             :  * \param[in] value      Node attribute value
     294             :  * \param[in] user_data  Fail count data to update
     295             :  */
     296             : static void
     297           0 : update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
     298             : {
     299           0 :     struct failcount_data *fc_data = user_data;
     300             : 
     301             :     // If this is a matching fail count attribute, update fail count
     302           0 :     if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
     303           0 :         fc_data->failcount = pcmk__add_scores(fc_data->failcount,
     304             :                                               char2score(value));
     305           0 :         pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
     306             :                         (const char *) key, (const char *) value,
     307             :                         fc_data->rsc->id,
     308             :                         pcmk_readable_score(fc_data->failcount));
     309           0 :         return;
     310             :     }
     311             : 
     312             :     // If this is a matching last failure attribute, update last failure
     313           0 :     if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
     314             :                 0) == 0) {
     315             :         long long last_ll;
     316             : 
     317           0 :         if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
     318           0 :             fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
     319             :                                                     last_ll);
     320             :         }
     321             :     }
     322             : }
     323             : 
     324             : /*!
     325             :  * \internal
     326             :  * \brief Update fail count and last failure appropriately for a filler resource
     327             :  *
     328             :  * \param[in] data       Filler resource
     329             :  * \param[in] user_data  Fail count data to update
     330             :  */
     331             : static void
     332           0 : update_failcount_for_filler(gpointer data, gpointer user_data)
     333             : {
     334           0 :     pcmk_resource_t *filler = data;
     335           0 :     struct failcount_data *fc_data = user_data;
     336           0 :     time_t filler_last_failure = 0;
     337             : 
     338           0 :     fc_data->failcount += pe_get_failcount(fc_data->node, filler,
     339             :                                            &filler_last_failure, fc_data->flags,
     340             :                                            fc_data->xml_op);
     341           0 :     fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
     342           0 : }
     343             : 
     344             : /*!
     345             :  * \internal
     346             :  * \brief Get a resource's fail count on a node
     347             :  *
     348             :  * \param[in]     node          Node to check
     349             :  * \param[in,out] rsc           Resource to check
     350             :  * \param[out]    last_failure  If not NULL, where to set time of most recent
     351             :  *                              failure of \p rsc on \p node
     352             :  * \param[in]     flags         Group of enum pcmk__fc_flags
     353             :  * \param[in]     xml_op        If not NULL, consider only the action in this
     354             :  *                              history entry when determining whether on-fail
     355             :  *                              is configured as "blocked", otherwise consider
     356             :  *                              all actions configured for \p rsc
     357             :  *
     358             :  * \return Fail count for \p rsc on \p node according to \p flags
     359             :  */
     360             : int
     361           0 : pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
     362             :                  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
     363             : {
     364           0 :     struct failcount_data fc_data = {
     365             :         .node = node,
     366             :         .rsc = rsc,
     367             :         .flags = flags,
     368             :         .xml_op = xml_op,
     369             :         .failcount = 0,
     370             :         .last_failure = (time_t) 0,
     371             :     };
     372             : 
     373             :     // Calculate resource failcount as sum of all matching operation failcounts
     374           0 :     CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
     375             :                                     &fc_data.lastfailure_re) == pcmk_rc_ok,
     376             :               return 0);
     377           0 :     g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
     378             :                          &fc_data);
     379           0 :     regfree(&(fc_data.failcount_re));
     380           0 :     regfree(&(fc_data.lastfailure_re));
     381             : 
     382             :     // If failure blocks the resource, disregard any failure timeout
     383           0 :     if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
     384           0 :         && block_failure(node, rsc, xml_op)) {
     385             : 
     386           0 :         pcmk__config_warn("Ignoring failure timeout %d for %s "
     387             :                           "because it conflicts with "
     388             :                           PCMK_META_ON_FAIL "=" PCMK_VALUE_BLOCK,
     389             :                           rsc->failure_timeout, rsc->id);
     390           0 :         rsc->failure_timeout = 0;
     391             :     }
     392             : 
     393             :     // If all failures have expired, ignore fail count
     394           0 :     if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
     395           0 :         && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
     396             : 
     397           0 :         time_t now = get_effective_time(rsc->cluster);
     398             : 
     399           0 :         if (now > (fc_data.last_failure + rsc->failure_timeout)) {
     400           0 :             pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
     401             :                             rsc->id, pcmk__node_name(node),
     402             :                             rsc->failure_timeout);
     403           0 :             fc_data.failcount = 0;
     404             :         }
     405             :     }
     406             : 
     407             :     /* Add the fail count of any filler resources, except that we never want the
     408             :      * fail counts of a bundle container's fillers to count towards the
     409             :      * container's fail count.
     410             :      *
     411             :      * Most importantly, a Pacemaker Remote connection to a bundle container
     412             :      * is a filler of the container, but can reside on a different node than the
     413             :      * container itself. Counting its fail count on its node towards the
     414             :      * container's fail count on that node could lead to attempting to stop the
     415             :      * container on the wrong node.
     416             :      */
     417           0 :     if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
     418           0 :         && !pcmk__is_bundled(rsc)) {
     419             : 
     420           0 :         g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
     421           0 :         if (fc_data.failcount > 0) {
     422           0 :             pcmk__rsc_info(rsc,
     423             :                            "Container %s and the resources within it "
     424             :                            "have failed %s time%s on %s",
     425             :                            rsc->id, pcmk_readable_score(fc_data.failcount),
     426             :                            pcmk__plural_s(fc_data.failcount),
     427             :                            pcmk__node_name(node));
     428             :         }
     429             : 
     430           0 :     } else if (fc_data.failcount > 0) {
     431           0 :         pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
     432             :                        rsc->id, pcmk_readable_score(fc_data.failcount),
     433             :                        pcmk__plural_s(fc_data.failcount),
     434             :                        pcmk__node_name(node));
     435             :     }
     436             : 
     437           0 :     if (last_failure != NULL) {
     438           0 :         if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
     439           0 :             *last_failure = fc_data.last_failure;
     440             :         } else  {
     441           0 :             *last_failure = 0;
     442             :         }
     443             :     }
     444           0 :     return fc_data.failcount;
     445             : }
     446             : 
     447             : /*!
     448             :  * \brief Schedule a controller operation to clear a fail count
     449             :  *
     450             :  * \param[in,out] rsc        Resource with failure
     451             :  * \param[in]     node       Node failure occurred on
     452             :  * \param[in]     reason     Readable description why needed (for logging)
     453             :  * \param[in,out] scheduler  Scheduler data cluster
     454             :  *
     455             :  * \return Scheduled action
     456             :  */
     457             : pcmk_action_t *
     458           0 : pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
     459             :                     const char *reason, pcmk_scheduler_t *scheduler)
     460             : {
     461           0 :     char *key = NULL;
     462           0 :     pcmk_action_t *clear = NULL;
     463             : 
     464           0 :     CRM_CHECK(rsc && node && reason && scheduler, return NULL);
     465             : 
     466           0 :     key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
     467           0 :     clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
     468             :                           scheduler);
     469           0 :     pcmk__insert_meta(clear, PCMK__META_OP_NO_WAIT, PCMK_VALUE_TRUE);
     470           0 :     crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
     471             :                rsc->id, pcmk__node_name(node), reason, clear->uuid);
     472           0 :     return clear;
     473             : }

Generated by: LCOV version 1.14