Line data Source code
1 : /*
2 : * Copyright 2004-2024 the Pacemaker project contributors
3 : *
4 : * The version control history for this file may have further details.
5 : *
6 : * This source code is licensed under the GNU Lesser General Public License
7 : * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 : */
9 :
10 : #include <crm_internal.h>
11 :
12 : #include <stdio.h>
13 : #include <string.h>
14 : #include <glib.h>
15 : #include <time.h>
16 :
17 : #include <crm/crm.h>
18 : #include <crm/services.h>
19 : #include <crm/common/xml.h>
20 : #include <crm/common/xml_internal.h>
21 :
22 : #include <crm/common/util.h>
23 : #include <crm/pengine/rules.h>
24 : #include <crm/pengine/internal.h>
25 : #include <pe_status_private.h>
26 :
27 : CRM_TRACE_INIT_DATA(pe_status);
28 :
29 : // A (parsed) resource action history entry
30 : struct action_history {
31 : pcmk_resource_t *rsc; // Resource that history is for
32 : pcmk_node_t *node; // Node that history is for
33 : xmlNode *xml; // History entry XML
34 :
35 : // Parsed from entry XML
36 : const char *id; // XML ID of history entry
37 : const char *key; // Operation key of action
38 : const char *task; // Action name
39 : const char *exit_reason; // Exit reason given for result
40 : guint interval_ms; // Action interval
41 : int call_id; // Call ID of action
42 : int expected_exit_status; // Expected exit status of action
43 : int exit_status; // Actual exit status of action
44 : int execution_status; // Execution status of action
45 : };
46 :
/* Set or clear a scheduler flag according to a boolean cluster option. If the
 * option has no value, the flag is left untouched.
 *
 * pcmk__set_flags_as() and pcmk__clear_flags_as() are called directly (instead
 * of going through pcmk__set_scheduler_flags() and
 * pcmk__clear_scheduler_flags()) so that log messages show the flag name in a
 * more readable form.
 */
#define set_config_flag(scheduler, option, flag) do {                       \
        const char *scf_value =                                             \
            pcmk__cluster_option((scheduler)->config_hash, (option));       \
                                                                            \
        if (scf_value != NULL) {                                            \
            (scheduler)->flags = crm_is_true(scf_value)?                    \
                pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE,           \
                                   "Scheduler", crm_system_name,            \
                                   (scheduler)->flags, (flag), #flag)       \
                : pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE,       \
                                       "Scheduler", crm_system_name,        \
                                       (scheduler)->flags, (flag), #flag);  \
        }                                                                   \
    } while (0)
69 :
70 : static void unpack_rsc_op(pcmk_resource_t *rsc, pcmk_node_t *node,
71 : xmlNode *xml_op, xmlNode **last_failure,
72 : enum action_fail_response *failed);
73 : static void determine_remote_online_status(pcmk_scheduler_t *scheduler,
74 : pcmk_node_t *this_node);
75 : static void add_node_attrs(const xmlNode *xml_obj, pcmk_node_t *node,
76 : bool overwrite, pcmk_scheduler_t *scheduler);
77 : static void determine_online_status(const xmlNode *node_state,
78 : pcmk_node_t *this_node,
79 : pcmk_scheduler_t *scheduler);
80 :
81 : static void unpack_node_lrm(pcmk_node_t *node, const xmlNode *xml,
82 : pcmk_scheduler_t *scheduler);
83 :
84 :
85 : static gboolean
86 0 : is_dangling_guest_node(pcmk_node_t *node)
87 : {
88 : /* we are looking for a remote-node that was supposed to be mapped to a
89 : * container resource, but all traces of that container have disappeared
90 : * from both the config and the status section. */
91 0 : if (pcmk__is_pacemaker_remote_node(node)
92 0 : && (node->details->remote_rsc != NULL)
93 0 : && (node->details->remote_rsc->container == NULL)
94 0 : && pcmk_is_set(node->details->remote_rsc->flags,
95 : pcmk_rsc_removed_filler)) {
96 0 : return TRUE;
97 : }
98 :
99 0 : return FALSE;
100 : }
101 :
102 : /*!
103 : * \brief Schedule a fence action for a node
104 : *
105 : * \param[in,out] scheduler Scheduler data
106 : * \param[in,out] node Node to fence
107 : * \param[in] reason Text description of why fencing is needed
108 : * \param[in] priority_delay Whether to consider
109 : * \c PCMK_OPT_PRIORITY_FENCING_DELAY
110 : */
111 : void
112 0 : pe_fence_node(pcmk_scheduler_t *scheduler, pcmk_node_t *node,
113 : const char *reason, bool priority_delay)
114 : {
115 0 : CRM_CHECK(node, return);
116 :
117 : /* A guest node is fenced by marking its container as failed */
118 0 : if (pcmk__is_guest_or_bundle_node(node)) {
119 0 : pcmk_resource_t *rsc = node->details->remote_rsc->container;
120 :
121 0 : if (!pcmk_is_set(rsc->flags, pcmk_rsc_failed)) {
122 0 : if (!pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
123 0 : crm_notice("Not fencing guest node %s "
124 : "(otherwise would because %s): "
125 : "its guest resource %s is unmanaged",
126 : pcmk__node_name(node), reason, rsc->id);
127 : } else {
128 0 : pcmk__sched_warn("Guest node %s will be fenced "
129 : "(by recovering its guest resource %s): %s",
130 : pcmk__node_name(node), rsc->id, reason);
131 :
132 : /* We don't mark the node as unclean because that would prevent the
133 : * node from running resources. We want to allow it to run resources
134 : * in this transition if the recovery succeeds.
135 : */
136 0 : node->details->remote_requires_reset = TRUE;
137 0 : pcmk__set_rsc_flags(rsc,
138 : pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
139 : }
140 : }
141 :
142 0 : } else if (is_dangling_guest_node(node)) {
143 0 : crm_info("Cleaning up dangling connection for guest node %s: "
144 : "fencing was already done because %s, "
145 : "and guest resource no longer exists",
146 : pcmk__node_name(node), reason);
147 0 : pcmk__set_rsc_flags(node->details->remote_rsc,
148 : pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
149 :
150 0 : } else if (pcmk__is_remote_node(node)) {
151 0 : pcmk_resource_t *rsc = node->details->remote_rsc;
152 :
153 0 : if ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
154 0 : crm_notice("Not fencing remote node %s "
155 : "(otherwise would because %s): connection is unmanaged",
156 : pcmk__node_name(node), reason);
157 0 : } else if(node->details->remote_requires_reset == FALSE) {
158 0 : node->details->remote_requires_reset = TRUE;
159 0 : pcmk__sched_warn("Remote node %s %s: %s",
160 : pcmk__node_name(node),
161 : pe_can_fence(scheduler, node)? "will be fenced" : "is unclean",
162 : reason);
163 : }
164 0 : node->details->unclean = TRUE;
165 : // No need to apply PCMK_OPT_PRIORITY_FENCING_DELAY for remote nodes
166 0 : pe_fence_op(node, NULL, TRUE, reason, FALSE, scheduler);
167 :
168 0 : } else if (node->details->unclean) {
169 0 : crm_trace("Cluster node %s %s because %s",
170 : pcmk__node_name(node),
171 : pe_can_fence(scheduler, node)? "would also be fenced" : "also is unclean",
172 : reason);
173 :
174 : } else {
175 0 : pcmk__sched_warn("Cluster node %s %s: %s",
176 : pcmk__node_name(node),
177 : pe_can_fence(scheduler, node)? "will be fenced" : "is unclean",
178 : reason);
179 0 : node->details->unclean = TRUE;
180 0 : pe_fence_op(node, NULL, TRUE, reason, priority_delay, scheduler);
181 : }
182 : }
183 :
// @TODO These XPath expressions can't handle templates, rules, or id-refs

/* Matches an nvpair element whose name is PCMK_STONITH_PROVIDES or
 * PCMK_META_REQUIRES and whose value is PCMK_VALUE_UNFENCING
 */
#define XPATH_UNFENCING_NVPAIR                                          \
    PCMK_XE_NVPAIR                                                      \
    "[(@" PCMK_XA_NAME "='" PCMK_STONITH_PROVIDES "'"                   \
    "or @" PCMK_XA_NAME "='" PCMK_META_REQUIRES "') "                   \
    "and @" PCMK_XA_VALUE "='" PCMK_VALUE_UNFENCING "']"

/* Matches an unfencing nvpair in a PCMK_XE_META_ATTRIBUTES block anywhere
 * below PCMK_XE_RESOURCES, or in one directly below PCMK_XE_RSC_DEFAULTS
 */
#define XPATH_ENABLE_UNFENCING                                          \
    "/" PCMK_XE_CIB "/" PCMK_XE_CONFIGURATION "/" PCMK_XE_RESOURCES     \
    "//" PCMK_XE_META_ATTRIBUTES "/" XPATH_UNFENCING_NVPAIR             \
    "|/" PCMK_XE_CIB "/" PCMK_XE_CONFIGURATION "/" PCMK_XE_RSC_DEFAULTS \
    "/" PCMK_XE_META_ATTRIBUTES "/" XPATH_UNFENCING_NVPAIR
198 :
199 : static void
200 0 : set_if_xpath(uint64_t flag, const char *xpath, pcmk_scheduler_t *scheduler)
201 : {
202 0 : xmlXPathObjectPtr result = NULL;
203 :
204 0 : if (!pcmk_is_set(scheduler->flags, flag)) {
205 0 : result = xpath_search(scheduler->input, xpath);
206 0 : if (result && (numXpathResults(result) > 0)) {
207 0 : pcmk__set_scheduler_flags(scheduler, flag);
208 : }
209 0 : freeXpathObject(result);
210 : }
211 0 : }
212 :
213 : gboolean
214 0 : unpack_config(xmlNode *config, pcmk_scheduler_t *scheduler)
215 : {
216 0 : const char *value = NULL;
217 0 : guint interval_ms = 0U;
218 0 : GHashTable *config_hash = pcmk__strkey_table(free, free);
219 :
220 0 : pe_rule_eval_data_t rule_data = {
221 : .node_hash = NULL,
222 0 : .now = scheduler->now,
223 : .match_data = NULL,
224 : .rsc_data = NULL,
225 : .op_data = NULL
226 : };
227 :
228 0 : scheduler->config_hash = config_hash;
229 :
230 0 : pe__unpack_dataset_nvpairs(config, PCMK_XE_CLUSTER_PROPERTY_SET, &rule_data,
231 : config_hash, PCMK_VALUE_CIB_BOOTSTRAP_OPTIONS,
232 : FALSE, scheduler);
233 :
234 0 : pcmk__validate_cluster_options(config_hash);
235 :
236 0 : set_config_flag(scheduler, PCMK_OPT_ENABLE_STARTUP_PROBES,
237 : pcmk_sched_probe_resources);
238 0 : if (!pcmk_is_set(scheduler->flags, pcmk_sched_probe_resources)) {
239 0 : crm_info("Startup probes: disabled (dangerous)");
240 : }
241 :
242 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_HAVE_WATCHDOG);
243 0 : if (value && crm_is_true(value)) {
244 0 : crm_info("Watchdog-based self-fencing will be performed via SBD if "
245 : "fencing is required and " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
246 : " is nonzero");
247 0 : pcmk__set_scheduler_flags(scheduler, pcmk_sched_have_fencing);
248 : }
249 :
250 : /* Set certain flags via xpath here, so they can be used before the relevant
251 : * configuration sections are unpacked.
252 : */
253 0 : set_if_xpath(pcmk_sched_enable_unfencing, XPATH_ENABLE_UNFENCING,
254 : scheduler);
255 :
256 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_STONITH_TIMEOUT);
257 0 : pcmk_parse_interval_spec(value, &interval_ms);
258 :
259 0 : if (interval_ms >= INT_MAX) {
260 0 : scheduler->stonith_timeout = INT_MAX;
261 : } else {
262 0 : scheduler->stonith_timeout = (int) interval_ms;
263 : }
264 0 : crm_debug("STONITH timeout: %d", scheduler->stonith_timeout);
265 :
266 0 : set_config_flag(scheduler, PCMK_OPT_STONITH_ENABLED,
267 : pcmk_sched_fencing_enabled);
268 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
269 0 : crm_debug("STONITH of failed nodes is enabled");
270 : } else {
271 0 : crm_debug("STONITH of failed nodes is disabled");
272 : }
273 :
274 0 : scheduler->stonith_action = pcmk__cluster_option(config_hash,
275 : PCMK_OPT_STONITH_ACTION);
276 0 : if (!strcmp(scheduler->stonith_action, PCMK__ACTION_POWEROFF)) {
277 0 : pcmk__warn_once(pcmk__wo_poweroff,
278 : "Support for " PCMK_OPT_STONITH_ACTION " of "
279 : "'" PCMK__ACTION_POWEROFF "' is deprecated and will be "
280 : "removed in a future release "
281 : "(use '" PCMK_ACTION_OFF "' instead)");
282 0 : scheduler->stonith_action = PCMK_ACTION_OFF;
283 : }
284 0 : crm_trace("STONITH will %s nodes", scheduler->stonith_action);
285 :
286 0 : set_config_flag(scheduler, PCMK_OPT_CONCURRENT_FENCING,
287 : pcmk_sched_concurrent_fencing);
288 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_concurrent_fencing)) {
289 0 : crm_debug("Concurrent fencing is enabled");
290 : } else {
291 0 : crm_debug("Concurrent fencing is disabled");
292 : }
293 :
294 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_PRIORITY_FENCING_DELAY);
295 0 : if (value) {
296 0 : pcmk_parse_interval_spec(value, &interval_ms);
297 0 : scheduler->priority_fencing_delay = (int) (interval_ms / 1000);
298 0 : crm_trace("Priority fencing delay is %ds",
299 : scheduler->priority_fencing_delay);
300 : }
301 :
302 0 : set_config_flag(scheduler, PCMK_OPT_STOP_ALL_RESOURCES,
303 : pcmk_sched_stop_all);
304 0 : crm_debug("Stop all active resources: %s",
305 : pcmk__flag_text(scheduler->flags, pcmk_sched_stop_all));
306 :
307 0 : set_config_flag(scheduler, PCMK_OPT_SYMMETRIC_CLUSTER,
308 : pcmk_sched_symmetric_cluster);
309 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_symmetric_cluster)) {
310 0 : crm_debug("Cluster is symmetric" " - resources can run anywhere by default");
311 : }
312 :
313 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_NO_QUORUM_POLICY);
314 :
315 0 : if (pcmk__str_eq(value, PCMK_VALUE_IGNORE, pcmk__str_casei)) {
316 0 : scheduler->no_quorum_policy = pcmk_no_quorum_ignore;
317 :
318 0 : } else if (pcmk__str_eq(value, PCMK_VALUE_FREEZE, pcmk__str_casei)) {
319 0 : scheduler->no_quorum_policy = pcmk_no_quorum_freeze;
320 :
321 0 : } else if (pcmk__str_eq(value, PCMK_VALUE_DEMOTE, pcmk__str_casei)) {
322 0 : scheduler->no_quorum_policy = pcmk_no_quorum_demote;
323 :
324 0 : } else if (pcmk__str_eq(value, PCMK_VALUE_FENCE_LEGACY, pcmk__str_casei)) {
325 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
326 0 : int do_panic = 0;
327 :
328 0 : crm_element_value_int(scheduler->input, PCMK_XA_NO_QUORUM_PANIC,
329 : &do_panic);
330 0 : if (do_panic || pcmk_is_set(scheduler->flags, pcmk_sched_quorate)) {
331 0 : scheduler->no_quorum_policy = pcmk_no_quorum_fence;
332 : } else {
333 0 : crm_notice("Resetting " PCMK_OPT_NO_QUORUM_POLICY
334 : " to 'stop': cluster has never had quorum");
335 0 : scheduler->no_quorum_policy = pcmk_no_quorum_stop;
336 : }
337 : } else {
338 0 : pcmk__config_err("Resetting " PCMK_OPT_NO_QUORUM_POLICY
339 : " to 'stop' because fencing is disabled");
340 0 : scheduler->no_quorum_policy = pcmk_no_quorum_stop;
341 : }
342 :
343 : } else {
344 0 : scheduler->no_quorum_policy = pcmk_no_quorum_stop;
345 : }
346 :
347 0 : switch (scheduler->no_quorum_policy) {
348 0 : case pcmk_no_quorum_freeze:
349 0 : crm_debug("On loss of quorum: Freeze resources");
350 0 : break;
351 0 : case pcmk_no_quorum_stop:
352 0 : crm_debug("On loss of quorum: Stop ALL resources");
353 0 : break;
354 0 : case pcmk_no_quorum_demote:
355 0 : crm_debug("On loss of quorum: "
356 : "Demote promotable resources and stop other resources");
357 0 : break;
358 0 : case pcmk_no_quorum_fence:
359 0 : crm_notice("On loss of quorum: Fence all remaining nodes");
360 0 : break;
361 0 : case pcmk_no_quorum_ignore:
362 0 : crm_notice("On loss of quorum: Ignore");
363 0 : break;
364 : }
365 :
366 0 : set_config_flag(scheduler, PCMK_OPT_STOP_ORPHAN_RESOURCES,
367 : pcmk_sched_stop_removed_resources);
368 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_stop_removed_resources)) {
369 0 : crm_trace("Orphan resources are stopped");
370 : } else {
371 0 : crm_trace("Orphan resources are ignored");
372 : }
373 :
374 0 : set_config_flag(scheduler, PCMK_OPT_STOP_ORPHAN_ACTIONS,
375 : pcmk_sched_cancel_removed_actions);
376 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_cancel_removed_actions)) {
377 0 : crm_trace("Orphan resource actions are stopped");
378 : } else {
379 0 : crm_trace("Orphan resource actions are ignored");
380 : }
381 :
382 0 : value = pcmk__cluster_option(config_hash, PCMK__OPT_REMOVE_AFTER_STOP);
383 0 : if (value != NULL) {
384 0 : if (crm_is_true(value)) {
385 0 : pcmk__set_scheduler_flags(scheduler, pcmk_sched_remove_after_stop);
386 0 : pcmk__warn_once(pcmk__wo_remove_after,
387 : "Support for the " PCMK__OPT_REMOVE_AFTER_STOP
388 : " cluster property is deprecated and will be "
389 : "removed in a future release");
390 : } else {
391 0 : pcmk__clear_scheduler_flags(scheduler,
392 : pcmk_sched_remove_after_stop);
393 : }
394 : }
395 :
396 0 : set_config_flag(scheduler, PCMK_OPT_MAINTENANCE_MODE,
397 : pcmk_sched_in_maintenance);
398 0 : crm_trace("Maintenance mode: %s",
399 : pcmk__flag_text(scheduler->flags, pcmk_sched_in_maintenance));
400 :
401 0 : set_config_flag(scheduler, PCMK_OPT_START_FAILURE_IS_FATAL,
402 : pcmk_sched_start_failure_fatal);
403 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_start_failure_fatal)) {
404 0 : crm_trace("Start failures are always fatal");
405 : } else {
406 0 : crm_trace("Start failures are handled by failcount");
407 : }
408 :
409 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
410 0 : set_config_flag(scheduler, PCMK_OPT_STARTUP_FENCING,
411 : pcmk_sched_startup_fencing);
412 : }
413 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_startup_fencing)) {
414 0 : crm_trace("Unseen nodes will be fenced");
415 : } else {
416 0 : pcmk__warn_once(pcmk__wo_blind,
417 : "Blind faith: not fencing unseen nodes");
418 : }
419 :
420 0 : pe__unpack_node_health_scores(scheduler);
421 :
422 0 : scheduler->placement_strategy =
423 0 : pcmk__cluster_option(config_hash, PCMK_OPT_PLACEMENT_STRATEGY);
424 0 : crm_trace("Placement strategy: %s", scheduler->placement_strategy);
425 :
426 0 : set_config_flag(scheduler, PCMK_OPT_SHUTDOWN_LOCK,
427 : pcmk_sched_shutdown_lock);
428 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
429 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_SHUTDOWN_LOCK_LIMIT);
430 0 : pcmk_parse_interval_spec(value, &(scheduler->shutdown_lock));
431 0 : scheduler->shutdown_lock /= 1000;
432 0 : crm_trace("Resources will be locked to nodes that were cleanly "
433 : "shut down (locks expire after %s)",
434 : pcmk__readable_interval(scheduler->shutdown_lock));
435 : } else {
436 0 : crm_trace("Resources will not be locked to nodes that were cleanly "
437 : "shut down");
438 : }
439 :
440 0 : value = pcmk__cluster_option(config_hash, PCMK_OPT_NODE_PENDING_TIMEOUT);
441 0 : pcmk_parse_interval_spec(value, &(scheduler->node_pending_timeout));
442 0 : scheduler->node_pending_timeout /= 1000;
443 0 : if (scheduler->node_pending_timeout == 0) {
444 0 : crm_trace("Do not fence pending nodes");
445 : } else {
446 0 : crm_trace("Fence pending nodes after %s",
447 : pcmk__readable_interval(scheduler->node_pending_timeout
448 : * 1000));
449 : }
450 :
451 0 : return TRUE;
452 : }
453 :
454 : pcmk_node_t *
455 0 : pe_create_node(const char *id, const char *uname, const char *type,
456 : const char *score, pcmk_scheduler_t *scheduler)
457 : {
458 0 : pcmk_node_t *new_node = NULL;
459 :
460 0 : if (pcmk_find_node(scheduler, uname) != NULL) {
461 0 : pcmk__config_warn("More than one node entry has name '%s'", uname);
462 : }
463 :
464 0 : new_node = calloc(1, sizeof(pcmk_node_t));
465 0 : if (new_node == NULL) {
466 0 : pcmk__sched_err("Could not allocate memory for node %s", uname);
467 0 : return NULL;
468 : }
469 :
470 0 : new_node->weight = char2score(score);
471 0 : new_node->details = calloc(1, sizeof(struct pe_node_shared_s));
472 :
473 0 : if (new_node->details == NULL) {
474 0 : free(new_node);
475 0 : pcmk__sched_err("Could not allocate memory for node %s", uname);
476 0 : return NULL;
477 : }
478 :
479 0 : crm_trace("Creating node for entry %s/%s", uname, id);
480 0 : new_node->details->id = id;
481 0 : new_node->details->uname = uname;
482 0 : new_node->details->online = FALSE;
483 0 : new_node->details->shutdown = FALSE;
484 0 : new_node->details->rsc_discovery_enabled = TRUE;
485 0 : new_node->details->running_rsc = NULL;
486 0 : new_node->details->data_set = scheduler;
487 :
488 0 : if (pcmk__str_eq(type, PCMK_VALUE_MEMBER,
489 : pcmk__str_null_matches|pcmk__str_casei)) {
490 0 : new_node->details->type = pcmk_node_variant_cluster;
491 :
492 0 : } else if (pcmk__str_eq(type, PCMK_VALUE_REMOTE, pcmk__str_casei)) {
493 0 : new_node->details->type = pcmk_node_variant_remote;
494 0 : pcmk__set_scheduler_flags(scheduler, pcmk_sched_have_remote_nodes);
495 :
496 : } else {
497 : /* @COMPAT 'ping' is the default for backward compatibility, but it
498 : * should be changed to 'member' at a compatibility break
499 : */
500 0 : if (!pcmk__str_eq(type, PCMK__VALUE_PING, pcmk__str_casei)) {
501 0 : pcmk__config_warn("Node %s has unrecognized type '%s', "
502 : "assuming '" PCMK__VALUE_PING "'",
503 : pcmk__s(uname, "without name"), type);
504 : }
505 0 : pcmk__warn_once(pcmk__wo_ping_node,
506 : "Support for nodes of type '" PCMK__VALUE_PING "' "
507 : "(such as %s) is deprecated and will be removed in a "
508 : "future release",
509 : pcmk__s(uname, "unnamed node"));
510 0 : new_node->details->type = node_ping;
511 : }
512 :
513 0 : new_node->details->attrs = pcmk__strkey_table(free, free);
514 :
515 0 : if (pcmk__is_pacemaker_remote_node(new_node)) {
516 0 : pcmk__insert_dup(new_node->details->attrs, CRM_ATTR_KIND, "remote");
517 : } else {
518 0 : pcmk__insert_dup(new_node->details->attrs, CRM_ATTR_KIND, "cluster");
519 : }
520 :
521 0 : new_node->details->utilization = pcmk__strkey_table(free, free);
522 0 : new_node->details->digest_cache = pcmk__strkey_table(free,
523 : pe__free_digests);
524 :
525 0 : scheduler->nodes = g_list_insert_sorted(scheduler->nodes, new_node,
526 : pe__cmp_node_name);
527 0 : return new_node;
528 : }
529 :
530 : static const char *
531 0 : expand_remote_rsc_meta(xmlNode *xml_obj, xmlNode *parent, pcmk_scheduler_t *data)
532 : {
533 0 : xmlNode *attr_set = NULL;
534 0 : xmlNode *attr = NULL;
535 :
536 0 : const char *container_id = pcmk__xe_id(xml_obj);
537 0 : const char *remote_name = NULL;
538 0 : const char *remote_server = NULL;
539 0 : const char *remote_port = NULL;
540 0 : const char *connect_timeout = "60s";
541 0 : const char *remote_allow_migrate=NULL;
542 0 : const char *is_managed = NULL;
543 :
544 0 : for (attr_set = pcmk__xe_first_child(xml_obj, NULL, NULL, NULL);
545 0 : attr_set != NULL; attr_set = pcmk__xe_next(attr_set)) {
546 :
547 0 : if (!pcmk__xe_is(attr_set, PCMK_XE_META_ATTRIBUTES)) {
548 0 : continue;
549 : }
550 :
551 0 : for (attr = pcmk__xe_first_child(attr_set, NULL, NULL, NULL);
552 0 : attr != NULL; attr = pcmk__xe_next(attr)) {
553 :
554 0 : const char *value = crm_element_value(attr, PCMK_XA_VALUE);
555 0 : const char *name = crm_element_value(attr, PCMK_XA_NAME);
556 :
557 0 : if (name == NULL) { // Sanity
558 0 : continue;
559 : }
560 :
561 0 : if (strcmp(name, PCMK_META_REMOTE_NODE) == 0) {
562 0 : remote_name = value;
563 :
564 0 : } else if (strcmp(name, PCMK_META_REMOTE_ADDR) == 0) {
565 0 : remote_server = value;
566 :
567 0 : } else if (strcmp(name, PCMK_META_REMOTE_PORT) == 0) {
568 0 : remote_port = value;
569 :
570 0 : } else if (strcmp(name, PCMK_META_REMOTE_CONNECT_TIMEOUT) == 0) {
571 0 : connect_timeout = value;
572 :
573 0 : } else if (strcmp(name, PCMK_META_REMOTE_ALLOW_MIGRATE) == 0) {
574 0 : remote_allow_migrate = value;
575 :
576 0 : } else if (strcmp(name, PCMK_META_IS_MANAGED) == 0) {
577 0 : is_managed = value;
578 : }
579 : }
580 : }
581 :
582 0 : if (remote_name == NULL) {
583 0 : return NULL;
584 : }
585 :
586 0 : if (pe_find_resource(data->resources, remote_name) != NULL) {
587 0 : return NULL;
588 : }
589 :
590 0 : pe_create_remote_xml(parent, remote_name, container_id,
591 : remote_allow_migrate, is_managed,
592 : connect_timeout, remote_server, remote_port);
593 0 : return remote_name;
594 : }
595 :
596 : static void
597 0 : handle_startup_fencing(pcmk_scheduler_t *scheduler, pcmk_node_t *new_node)
598 : {
599 0 : if ((new_node->details->type == pcmk_node_variant_remote)
600 0 : && (new_node->details->remote_rsc == NULL)) {
601 : /* Ignore fencing for remote nodes that don't have a connection resource
602 : * associated with them. This happens when remote node entries get left
603 : * in the nodes section after the connection resource is removed.
604 : */
605 0 : return;
606 : }
607 :
608 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_startup_fencing)) {
609 : // All nodes are unclean until we've seen their status entry
610 0 : new_node->details->unclean = TRUE;
611 :
612 : } else {
613 : // Blind faith ...
614 0 : new_node->details->unclean = FALSE;
615 : }
616 :
617 : /* We need to be able to determine if a node's status section
618 : * exists or not separate from whether the node is unclean. */
619 0 : new_node->details->unseen = TRUE;
620 : }
621 :
622 : gboolean
623 0 : unpack_nodes(xmlNode *xml_nodes, pcmk_scheduler_t *scheduler)
624 : {
625 0 : xmlNode *xml_obj = NULL;
626 0 : pcmk_node_t *new_node = NULL;
627 0 : const char *id = NULL;
628 0 : const char *uname = NULL;
629 0 : const char *type = NULL;
630 0 : const char *score = NULL;
631 :
632 0 : for (xml_obj = pcmk__xe_first_child(xml_nodes, NULL, NULL, NULL);
633 0 : xml_obj != NULL; xml_obj = pcmk__xe_next(xml_obj)) {
634 :
635 0 : if (pcmk__xe_is(xml_obj, PCMK_XE_NODE)) {
636 0 : new_node = NULL;
637 :
638 0 : id = crm_element_value(xml_obj, PCMK_XA_ID);
639 0 : uname = crm_element_value(xml_obj, PCMK_XA_UNAME);
640 0 : type = crm_element_value(xml_obj, PCMK_XA_TYPE);
641 0 : score = crm_element_value(xml_obj, PCMK_XA_SCORE);
642 0 : crm_trace("Processing node %s/%s", uname, id);
643 :
644 0 : if (id == NULL) {
645 0 : pcmk__config_err("Ignoring <" PCMK_XE_NODE
646 : "> entry in configuration without id");
647 0 : continue;
648 : }
649 0 : new_node = pe_create_node(id, uname, type, score, scheduler);
650 :
651 0 : if (new_node == NULL) {
652 0 : return FALSE;
653 : }
654 :
655 0 : handle_startup_fencing(scheduler, new_node);
656 :
657 0 : add_node_attrs(xml_obj, new_node, FALSE, scheduler);
658 :
659 0 : crm_trace("Done with node %s",
660 : crm_element_value(xml_obj, PCMK_XA_UNAME));
661 : }
662 : }
663 :
664 0 : if (scheduler->localhost
665 0 : && (pcmk_find_node(scheduler, scheduler->localhost) == NULL)) {
666 0 : crm_info("Creating a fake local node");
667 0 : pe_create_node(scheduler->localhost, scheduler->localhost, NULL, 0,
668 : scheduler);
669 : }
670 :
671 0 : return TRUE;
672 : }
673 :
674 : static void
675 0 : setup_container(pcmk_resource_t *rsc, pcmk_scheduler_t *scheduler)
676 : {
677 0 : const char *container_id = NULL;
678 :
679 0 : if (rsc->children) {
680 0 : g_list_foreach(rsc->children, (GFunc) setup_container, scheduler);
681 0 : return;
682 : }
683 :
684 0 : container_id = g_hash_table_lookup(rsc->meta, PCMK__META_CONTAINER);
685 0 : if (container_id && !pcmk__str_eq(container_id, rsc->id, pcmk__str_casei)) {
686 0 : pcmk_resource_t *container = pe_find_resource(scheduler->resources,
687 : container_id);
688 :
689 0 : if (container) {
690 0 : rsc->container = container;
691 0 : pcmk__set_rsc_flags(container, pcmk_rsc_has_filler);
692 0 : container->fillers = g_list_append(container->fillers, rsc);
693 0 : pcmk__rsc_trace(rsc, "Resource %s's container is %s",
694 : rsc->id, container_id);
695 : } else {
696 0 : pcmk__config_err("Resource %s: Unknown resource container (%s)",
697 : rsc->id, container_id);
698 : }
699 : }
700 : }
701 :
702 : gboolean
703 0 : unpack_remote_nodes(xmlNode *xml_resources, pcmk_scheduler_t *scheduler)
704 : {
705 0 : xmlNode *xml_obj = NULL;
706 :
707 : /* Create remote nodes and guest nodes from the resource configuration
708 : * before unpacking resources.
709 : */
710 0 : for (xml_obj = pcmk__xe_first_child(xml_resources, NULL, NULL, NULL);
711 0 : xml_obj != NULL; xml_obj = pcmk__xe_next(xml_obj)) {
712 :
713 0 : const char *new_node_id = NULL;
714 :
715 : /* Check for remote nodes, which are defined by ocf:pacemaker:remote
716 : * primitives.
717 : */
718 0 : if (xml_contains_remote_node(xml_obj)) {
719 0 : new_node_id = pcmk__xe_id(xml_obj);
720 : /* The pcmk_find_node() check ensures we don't iterate over an
721 : * expanded node that has already been added to the node list
722 : */
723 0 : if (new_node_id
724 0 : && (pcmk_find_node(scheduler, new_node_id) == NULL)) {
725 0 : crm_trace("Found remote node %s defined by resource %s",
726 : new_node_id, pcmk__xe_id(xml_obj));
727 0 : pe_create_node(new_node_id, new_node_id, PCMK_VALUE_REMOTE,
728 : NULL, scheduler);
729 : }
730 0 : continue;
731 : }
732 :
733 : /* Check for guest nodes, which are defined by special meta-attributes
734 : * of a primitive of any type (for example, VirtualDomain or Xen).
735 : */
736 0 : if (pcmk__xe_is(xml_obj, PCMK_XE_PRIMITIVE)) {
737 : /* This will add an ocf:pacemaker:remote primitive to the
738 : * configuration for the guest node's connection, to be unpacked
739 : * later.
740 : */
741 0 : new_node_id = expand_remote_rsc_meta(xml_obj, xml_resources,
742 : scheduler);
743 0 : if (new_node_id
744 0 : && (pcmk_find_node(scheduler, new_node_id) == NULL)) {
745 0 : crm_trace("Found guest node %s in resource %s",
746 : new_node_id, pcmk__xe_id(xml_obj));
747 0 : pe_create_node(new_node_id, new_node_id, PCMK_VALUE_REMOTE,
748 : NULL, scheduler);
749 : }
750 0 : continue;
751 : }
752 :
753 : /* Check for guest nodes inside a group. Clones are currently not
754 : * supported as guest nodes.
755 : */
756 0 : if (pcmk__xe_is(xml_obj, PCMK_XE_GROUP)) {
757 0 : xmlNode *xml_obj2 = NULL;
758 0 : for (xml_obj2 = pcmk__xe_first_child(xml_obj, NULL, NULL, NULL);
759 0 : xml_obj2 != NULL; xml_obj2 = pcmk__xe_next(xml_obj2)) {
760 :
761 0 : new_node_id = expand_remote_rsc_meta(xml_obj2, xml_resources,
762 : scheduler);
763 :
764 0 : if (new_node_id
765 0 : && (pcmk_find_node(scheduler, new_node_id) == NULL)) {
766 0 : crm_trace("Found guest node %s in resource %s inside group %s",
767 : new_node_id, pcmk__xe_id(xml_obj2),
768 : pcmk__xe_id(xml_obj));
769 0 : pe_create_node(new_node_id, new_node_id, PCMK_VALUE_REMOTE,
770 : NULL, scheduler);
771 : }
772 : }
773 : }
774 : }
775 0 : return TRUE;
776 : }
777 :
778 : /* Call this after all the nodes and resources have been
779 : * unpacked, but before the status section is read.
780 : *
781 : * A remote node's online status is reflected by the state
782 : * of the remote node's connection resource. We need to link
783 : * the remote node to this connection resource so we can have
784 : * easy access to the connection resource during the scheduler calculations.
785 : */
786 : static void
787 0 : link_rsc2remotenode(pcmk_scheduler_t *scheduler, pcmk_resource_t *new_rsc)
788 : {
789 0 : pcmk_node_t *remote_node = NULL;
790 :
791 0 : if (new_rsc->is_remote_node == FALSE) {
792 0 : return;
793 : }
794 :
795 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_location_only)) {
796 : /* remote_nodes and remote_resources are not linked in quick location calculations */
797 0 : return;
798 : }
799 :
800 0 : remote_node = pcmk_find_node(scheduler, new_rsc->id);
801 0 : CRM_CHECK(remote_node != NULL, return);
802 :
803 0 : pcmk__rsc_trace(new_rsc, "Linking remote connection resource %s to %s",
804 : new_rsc->id, pcmk__node_name(remote_node));
805 0 : remote_node->details->remote_rsc = new_rsc;
806 :
807 0 : if (new_rsc->container == NULL) {
808 : /* Handle start-up fencing for remote nodes (as opposed to guest nodes)
809 : * the same as is done for cluster nodes.
810 : */
811 0 : handle_startup_fencing(scheduler, remote_node);
812 :
813 : } else {
814 : /* pe_create_node() marks the new node as "remote" or "cluster"; now
815 : * that we know the node is a guest node, update it correctly.
816 : */
817 0 : pcmk__insert_dup(remote_node->details->attrs,
818 : CRM_ATTR_KIND, "container");
819 : }
820 : }
821 :
822 : static void
823 0 : destroy_tag(gpointer data)
824 : {
825 0 : pcmk_tag_t *tag = data;
826 :
827 0 : if (tag) {
828 0 : free(tag->id);
829 0 : g_list_free_full(tag->refs, free);
830 0 : free(tag);
831 : }
832 0 : }
833 :
834 : /*!
835 : * \internal
836 : * \brief Parse configuration XML for resource information
837 : *
838 : * \param[in] xml_resources Top of resource configuration XML
839 : * \param[in,out] scheduler Scheduler data
840 : *
841 : * \return TRUE
842 : *
843 : * \note unpack_remote_nodes() MUST be called before this, so that the nodes can
844 : * be used when pe__unpack_resource() calls resource_location()
845 : */
846 : gboolean
847 0 : unpack_resources(const xmlNode *xml_resources, pcmk_scheduler_t *scheduler)
848 : {
849 0 : xmlNode *xml_obj = NULL;
850 0 : GList *gIter = NULL;
851 :
852 0 : scheduler->template_rsc_sets = pcmk__strkey_table(free, destroy_tag);
853 :
854 0 : for (xml_obj = pcmk__xe_first_child(xml_resources, NULL, NULL, NULL);
855 0 : xml_obj != NULL; xml_obj = pcmk__xe_next(xml_obj)) {
856 :
857 0 : pcmk_resource_t *new_rsc = NULL;
858 0 : const char *id = pcmk__xe_id(xml_obj);
859 :
860 0 : if (pcmk__str_empty(id)) {
861 0 : pcmk__config_err("Ignoring <%s> resource without ID",
862 : xml_obj->name);
863 0 : continue;
864 : }
865 :
866 0 : if (pcmk__xe_is(xml_obj, PCMK_XE_TEMPLATE)) {
867 0 : if (g_hash_table_lookup_extended(scheduler->template_rsc_sets, id,
868 : NULL, NULL) == FALSE) {
869 : /* Record the template's ID for the knowledge of its existence anyway. */
870 0 : pcmk__insert_dup(scheduler->template_rsc_sets, id, NULL);
871 : }
872 0 : continue;
873 : }
874 :
875 0 : crm_trace("Unpacking <%s " PCMK_XA_ID "='%s'>", xml_obj->name, id);
876 0 : if (pe__unpack_resource(xml_obj, &new_rsc, NULL,
877 : scheduler) == pcmk_rc_ok) {
878 0 : scheduler->resources = g_list_append(scheduler->resources, new_rsc);
879 0 : pcmk__rsc_trace(new_rsc, "Added resource %s", new_rsc->id);
880 :
881 : } else {
882 0 : pcmk__config_err("Ignoring <%s> resource '%s' "
883 : "because configuration is invalid",
884 : xml_obj->name, id);
885 : }
886 : }
887 :
888 0 : for (gIter = scheduler->resources; gIter != NULL; gIter = gIter->next) {
889 0 : pcmk_resource_t *rsc = (pcmk_resource_t *) gIter->data;
890 :
891 0 : setup_container(rsc, scheduler);
892 0 : link_rsc2remotenode(scheduler, rsc);
893 : }
894 :
895 0 : scheduler->resources = g_list_sort(scheduler->resources,
896 : pe__cmp_rsc_priority);
897 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_location_only)) {
898 : /* Ignore */
899 :
900 0 : } else if (pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)
901 0 : && !pcmk_is_set(scheduler->flags, pcmk_sched_have_fencing)) {
902 :
903 0 : pcmk__config_err("Resource start-up disabled since no STONITH resources have been defined");
904 0 : pcmk__config_err("Either configure some or disable STONITH with the "
905 : PCMK_OPT_STONITH_ENABLED " option");
906 0 : pcmk__config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity");
907 : }
908 :
909 0 : return TRUE;
910 : }
911 :
912 : gboolean
913 0 : unpack_tags(xmlNode *xml_tags, pcmk_scheduler_t *scheduler)
914 : {
915 0 : xmlNode *xml_tag = NULL;
916 :
917 0 : scheduler->tags = pcmk__strkey_table(free, destroy_tag);
918 :
919 0 : for (xml_tag = pcmk__xe_first_child(xml_tags, NULL, NULL, NULL);
920 0 : xml_tag != NULL; xml_tag = pcmk__xe_next(xml_tag)) {
921 :
922 0 : xmlNode *xml_obj_ref = NULL;
923 0 : const char *tag_id = pcmk__xe_id(xml_tag);
924 :
925 0 : if (!pcmk__xe_is(xml_tag, PCMK_XE_TAG)) {
926 0 : continue;
927 : }
928 :
929 0 : if (tag_id == NULL) {
930 0 : pcmk__config_err("Ignoring <%s> without " PCMK_XA_ID,
931 : (const char *) xml_tag->name);
932 0 : continue;
933 : }
934 :
935 0 : for (xml_obj_ref = pcmk__xe_first_child(xml_tag, NULL, NULL, NULL);
936 0 : xml_obj_ref != NULL; xml_obj_ref = pcmk__xe_next(xml_obj_ref)) {
937 :
938 0 : const char *obj_ref = pcmk__xe_id(xml_obj_ref);
939 :
940 0 : if (!pcmk__xe_is(xml_obj_ref, PCMK_XE_OBJ_REF)) {
941 0 : continue;
942 : }
943 :
944 0 : if (obj_ref == NULL) {
945 0 : pcmk__config_err("Ignoring <%s> for tag '%s' without " PCMK_XA_ID,
946 : xml_obj_ref->name, tag_id);
947 0 : continue;
948 : }
949 :
950 0 : if (add_tag_ref(scheduler->tags, tag_id, obj_ref) == FALSE) {
951 0 : return FALSE;
952 : }
953 : }
954 : }
955 :
956 0 : return TRUE;
957 : }
958 :
959 : /* The ticket state section:
960 : * "/cib/status/tickets/ticket_state" */
961 : static gboolean
962 0 : unpack_ticket_state(xmlNode *xml_ticket, pcmk_scheduler_t *scheduler)
963 : {
964 0 : const char *ticket_id = NULL;
965 0 : const char *granted = NULL;
966 0 : const char *last_granted = NULL;
967 0 : const char *standby = NULL;
968 0 : xmlAttrPtr xIter = NULL;
969 :
970 0 : pcmk_ticket_t *ticket = NULL;
971 :
972 0 : ticket_id = pcmk__xe_id(xml_ticket);
973 0 : if (pcmk__str_empty(ticket_id)) {
974 0 : return FALSE;
975 : }
976 :
977 0 : crm_trace("Processing ticket state for %s", ticket_id);
978 :
979 0 : ticket = g_hash_table_lookup(scheduler->tickets, ticket_id);
980 0 : if (ticket == NULL) {
981 0 : ticket = ticket_new(ticket_id, scheduler);
982 0 : if (ticket == NULL) {
983 0 : return FALSE;
984 : }
985 : }
986 :
987 0 : for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) {
988 0 : const char *prop_name = (const char *)xIter->name;
989 0 : const char *prop_value = pcmk__xml_attr_value(xIter);
990 :
991 0 : if (pcmk__str_eq(prop_name, PCMK_XA_ID, pcmk__str_none)) {
992 0 : continue;
993 : }
994 0 : pcmk__insert_dup(ticket->state, prop_name, prop_value);
995 : }
996 :
997 0 : granted = g_hash_table_lookup(ticket->state, PCMK__XA_GRANTED);
998 0 : if (granted && crm_is_true(granted)) {
999 0 : ticket->granted = TRUE;
1000 0 : crm_info("We have ticket '%s'", ticket->id);
1001 : } else {
1002 0 : ticket->granted = FALSE;
1003 0 : crm_info("We do not have ticket '%s'", ticket->id);
1004 : }
1005 :
1006 0 : last_granted = g_hash_table_lookup(ticket->state, PCMK_XA_LAST_GRANTED);
1007 0 : if (last_granted) {
1008 : long long last_granted_ll;
1009 :
1010 0 : pcmk__scan_ll(last_granted, &last_granted_ll, 0LL);
1011 0 : ticket->last_granted = (time_t) last_granted_ll;
1012 : }
1013 :
1014 0 : standby = g_hash_table_lookup(ticket->state, PCMK_XA_STANDBY);
1015 0 : if (standby && crm_is_true(standby)) {
1016 0 : ticket->standby = TRUE;
1017 0 : if (ticket->granted) {
1018 0 : crm_info("Granted ticket '%s' is in standby-mode", ticket->id);
1019 : }
1020 : } else {
1021 0 : ticket->standby = FALSE;
1022 : }
1023 :
1024 0 : crm_trace("Done with ticket state for %s", ticket_id);
1025 :
1026 0 : return TRUE;
1027 : }
1028 :
1029 : static gboolean
1030 0 : unpack_tickets_state(xmlNode *xml_tickets, pcmk_scheduler_t *scheduler)
1031 : {
1032 0 : xmlNode *xml_obj = NULL;
1033 :
1034 0 : for (xml_obj = pcmk__xe_first_child(xml_tickets, NULL, NULL, NULL);
1035 0 : xml_obj != NULL; xml_obj = pcmk__xe_next(xml_obj)) {
1036 :
1037 0 : if (!pcmk__xe_is(xml_obj, PCMK__XE_TICKET_STATE)) {
1038 0 : continue;
1039 : }
1040 0 : unpack_ticket_state(xml_obj, scheduler);
1041 : }
1042 :
1043 0 : return TRUE;
1044 : }
1045 :
1046 : static void
1047 0 : unpack_handle_remote_attrs(pcmk_node_t *this_node, const xmlNode *state,
1048 : pcmk_scheduler_t *scheduler)
1049 : {
1050 0 : const char *discovery = NULL;
1051 0 : const xmlNode *attrs = NULL;
1052 0 : pcmk_resource_t *rsc = NULL;
1053 :
1054 0 : if (!pcmk__xe_is(state, PCMK__XE_NODE_STATE)) {
1055 0 : return;
1056 : }
1057 :
1058 0 : if ((this_node == NULL) || !pcmk__is_pacemaker_remote_node(this_node)) {
1059 0 : return;
1060 : }
1061 0 : crm_trace("Processing Pacemaker Remote node %s",
1062 : pcmk__node_name(this_node));
1063 :
1064 0 : pcmk__scan_min_int(crm_element_value(state, PCMK__XA_NODE_IN_MAINTENANCE),
1065 0 : &(this_node->details->remote_maintenance), 0);
1066 :
1067 0 : rsc = this_node->details->remote_rsc;
1068 0 : if (this_node->details->remote_requires_reset == FALSE) {
1069 0 : this_node->details->unclean = FALSE;
1070 0 : this_node->details->unseen = FALSE;
1071 : }
1072 0 : attrs = pcmk__xe_first_child(state, PCMK__XE_TRANSIENT_ATTRIBUTES, NULL,
1073 : NULL);
1074 0 : add_node_attrs(attrs, this_node, TRUE, scheduler);
1075 :
1076 0 : if (pe__shutdown_requested(this_node)) {
1077 0 : crm_info("%s is shutting down", pcmk__node_name(this_node));
1078 0 : this_node->details->shutdown = TRUE;
1079 : }
1080 :
1081 0 : if (crm_is_true(pcmk__node_attr(this_node, PCMK_NODE_ATTR_STANDBY, NULL,
1082 : pcmk__rsc_node_current))) {
1083 0 : crm_info("%s is in standby mode", pcmk__node_name(this_node));
1084 0 : this_node->details->standby = TRUE;
1085 : }
1086 :
1087 0 : if (crm_is_true(pcmk__node_attr(this_node, PCMK_NODE_ATTR_MAINTENANCE, NULL,
1088 : pcmk__rsc_node_current))
1089 0 : || ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_managed))) {
1090 0 : crm_info("%s is in maintenance mode", pcmk__node_name(this_node));
1091 0 : this_node->details->maintenance = TRUE;
1092 : }
1093 :
1094 0 : discovery = pcmk__node_attr(this_node,
1095 : PCMK__NODE_ATTR_RESOURCE_DISCOVERY_ENABLED,
1096 : NULL, pcmk__rsc_node_current);
1097 0 : if ((discovery != NULL) && !crm_is_true(discovery)) {
1098 0 : pcmk__warn_once(pcmk__wo_rdisc_enabled,
1099 : "Support for the "
1100 : PCMK__NODE_ATTR_RESOURCE_DISCOVERY_ENABLED
1101 : " node attribute is deprecated and will be removed"
1102 : " (and behave as 'true') in a future release.");
1103 :
1104 0 : if (pcmk__is_remote_node(this_node)
1105 0 : && !pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
1106 0 : pcmk__config_warn("Ignoring "
1107 : PCMK__NODE_ATTR_RESOURCE_DISCOVERY_ENABLED
1108 : " attribute on Pacemaker Remote node %s"
1109 : " because fencing is disabled",
1110 : pcmk__node_name(this_node));
1111 : } else {
1112 : /* This is either a remote node with fencing enabled, or a guest
1113 : * node. We don't care whether fencing is enabled when fencing guest
1114 : * nodes, because they are "fenced" by recovering their containing
1115 : * resource.
1116 : */
1117 0 : crm_info("%s has resource discovery disabled",
1118 : pcmk__node_name(this_node));
1119 0 : this_node->details->rsc_discovery_enabled = FALSE;
1120 : }
1121 : }
1122 : }
1123 :
1124 : /*!
1125 : * \internal
1126 : * \brief Unpack a cluster node's transient attributes
1127 : *
1128 : * \param[in] state CIB node state XML
1129 : * \param[in,out] node Cluster node whose attributes are being unpacked
1130 : * \param[in,out] scheduler Scheduler data
1131 : */
1132 : static void
1133 0 : unpack_transient_attributes(const xmlNode *state, pcmk_node_t *node,
1134 : pcmk_scheduler_t *scheduler)
1135 : {
1136 0 : const char *discovery = NULL;
1137 0 : const xmlNode *attrs = pcmk__xe_first_child(state,
1138 : PCMK__XE_TRANSIENT_ATTRIBUTES,
1139 : NULL, NULL);
1140 :
1141 0 : add_node_attrs(attrs, node, TRUE, scheduler);
1142 :
1143 0 : if (crm_is_true(pcmk__node_attr(node, PCMK_NODE_ATTR_STANDBY, NULL,
1144 : pcmk__rsc_node_current))) {
1145 0 : crm_info("%s is in standby mode", pcmk__node_name(node));
1146 0 : node->details->standby = TRUE;
1147 : }
1148 :
1149 0 : if (crm_is_true(pcmk__node_attr(node, PCMK_NODE_ATTR_MAINTENANCE, NULL,
1150 : pcmk__rsc_node_current))) {
1151 0 : crm_info("%s is in maintenance mode", pcmk__node_name(node));
1152 0 : node->details->maintenance = TRUE;
1153 : }
1154 :
1155 0 : discovery = pcmk__node_attr(node,
1156 : PCMK__NODE_ATTR_RESOURCE_DISCOVERY_ENABLED,
1157 : NULL, pcmk__rsc_node_current);
1158 0 : if ((discovery != NULL) && !crm_is_true(discovery)) {
1159 0 : pcmk__config_warn("Ignoring "
1160 : PCMK__NODE_ATTR_RESOURCE_DISCOVERY_ENABLED
1161 : " attribute for %s because disabling resource"
1162 : " discovery is not allowed for cluster nodes",
1163 : pcmk__node_name(node));
1164 : }
1165 0 : }
1166 :
1167 : /*!
1168 : * \internal
1169 : * \brief Unpack a node state entry (first pass)
1170 : *
1171 : * Unpack one node state entry from status. This unpacks information from the
1172 : * \C PCMK__XE_NODE_STATE element itself and node attributes inside it, but not
1173 : * the resource history inside it. Multiple passes through the status are needed
1174 : * to fully unpack everything.
1175 : *
1176 : * \param[in] state CIB node state XML
1177 : * \param[in,out] scheduler Scheduler data
1178 : */
1179 : static void
1180 0 : unpack_node_state(const xmlNode *state, pcmk_scheduler_t *scheduler)
1181 : {
1182 0 : const char *id = NULL;
1183 0 : const char *uname = NULL;
1184 0 : pcmk_node_t *this_node = NULL;
1185 :
1186 0 : id = crm_element_value(state, PCMK_XA_ID);
1187 0 : if (id == NULL) {
1188 0 : pcmk__config_err("Ignoring invalid " PCMK__XE_NODE_STATE " entry without "
1189 : PCMK_XA_ID);
1190 0 : crm_log_xml_info(state, "missing-id");
1191 0 : return;
1192 : }
1193 :
1194 0 : uname = crm_element_value(state, PCMK_XA_UNAME);
1195 0 : if (uname == NULL) {
1196 : /* If a joining peer makes the cluster acquire the quorum from corosync
1197 : * meanwhile it has not joined CPG membership of pacemaker-controld yet,
1198 : * it's possible that the created PCMK__XE_NODE_STATE entry doesn't have
1199 : * a PCMK_XA_UNAME yet. We should recognize the node as `pending` and
1200 : * wait for it to join CPG.
1201 : */
1202 0 : crm_trace("Handling " PCMK__XE_NODE_STATE " entry with id=\"%s\" "
1203 : "without " PCMK_XA_UNAME,
1204 : id);
1205 : }
1206 :
1207 0 : this_node = pe_find_node_any(scheduler->nodes, id, uname);
1208 0 : if (this_node == NULL) {
1209 0 : crm_notice("Ignoring recorded state for removed node with name %s and "
1210 : PCMK_XA_ID " %s", pcmk__s(uname, "unknown"), id);
1211 0 : return;
1212 : }
1213 :
1214 0 : if (pcmk__is_pacemaker_remote_node(this_node)) {
1215 : /* We can't determine the online status of Pacemaker Remote nodes until
1216 : * after all resource history has been unpacked. In this first pass, we
1217 : * do need to mark whether the node has been fenced, as this plays a
1218 : * role during unpacking cluster node resource state.
1219 : */
1220 0 : pcmk__scan_min_int(crm_element_value(state, PCMK__XA_NODE_FENCED),
1221 0 : &(this_node->details->remote_was_fenced), 0);
1222 0 : return;
1223 : }
1224 :
1225 0 : unpack_transient_attributes(state, this_node, scheduler);
1226 :
1227 : /* Provisionally mark this cluster node as clean. We have at least seen it
1228 : * in the current cluster's lifetime.
1229 : */
1230 0 : this_node->details->unclean = FALSE;
1231 0 : this_node->details->unseen = FALSE;
1232 :
1233 0 : crm_trace("Determining online status of cluster node %s (id %s)",
1234 : pcmk__node_name(this_node), id);
1235 0 : determine_online_status(state, this_node, scheduler);
1236 :
1237 0 : if (!pcmk_is_set(scheduler->flags, pcmk_sched_quorate)
1238 0 : && this_node->details->online
1239 0 : && (scheduler->no_quorum_policy == pcmk_no_quorum_fence)) {
1240 : /* Everything else should flow from this automatically
1241 : * (at least until the scheduler becomes able to migrate off
1242 : * healthy resources)
1243 : */
1244 0 : pe_fence_node(scheduler, this_node, "cluster does not have quorum",
1245 : FALSE);
1246 : }
1247 : }
1248 :
1249 : /*!
1250 : * \internal
1251 : * \brief Unpack nodes' resource history as much as possible
1252 : *
1253 : * Unpack as many nodes' resource history as possible in one pass through the
1254 : * status. We need to process Pacemaker Remote nodes' connections/containers
1255 : * before unpacking their history; the connection/container history will be
1256 : * in another node's history, so it might take multiple passes to unpack
1257 : * everything.
1258 : *
1259 : * \param[in] status CIB XML status section
1260 : * \param[in] fence If true, treat any not-yet-unpacked nodes as unseen
1261 : * \param[in,out] scheduler Scheduler data
1262 : *
1263 : * \return Standard Pacemaker return code (specifically pcmk_rc_ok if done,
1264 : * or EAGAIN if more unpacking remains to be done)
1265 : */
1266 : static int
1267 0 : unpack_node_history(const xmlNode *status, bool fence,
1268 : pcmk_scheduler_t *scheduler)
1269 : {
1270 0 : int rc = pcmk_rc_ok;
1271 :
1272 : // Loop through all PCMK__XE_NODE_STATE entries in CIB status
1273 0 : for (const xmlNode *state = pcmk__xe_first_child(status,
1274 : PCMK__XE_NODE_STATE, NULL,
1275 : NULL);
1276 0 : state != NULL; state = pcmk__xe_next_same(state)) {
1277 :
1278 0 : const char *id = pcmk__xe_id(state);
1279 0 : const char *uname = crm_element_value(state, PCMK_XA_UNAME);
1280 0 : pcmk_node_t *this_node = NULL;
1281 :
1282 0 : if ((id == NULL) || (uname == NULL)) {
1283 : // Warning already logged in first pass through status section
1284 0 : crm_trace("Not unpacking resource history from malformed "
1285 : PCMK__XE_NODE_STATE " without id and/or uname");
1286 0 : continue;
1287 : }
1288 :
1289 0 : this_node = pe_find_node_any(scheduler->nodes, id, uname);
1290 0 : if (this_node == NULL) {
1291 : // Warning already logged in first pass through status section
1292 0 : crm_trace("Not unpacking resource history for node %s because "
1293 : "no longer in configuration", id);
1294 0 : continue;
1295 : }
1296 :
1297 0 : if (this_node->details->unpacked) {
1298 0 : crm_trace("Not unpacking resource history for node %s because "
1299 : "already unpacked", id);
1300 0 : continue;
1301 : }
1302 :
1303 0 : if (fence) {
1304 : // We're processing all remaining nodes
1305 :
1306 0 : } else if (pcmk__is_guest_or_bundle_node(this_node)) {
1307 : /* We can unpack a guest node's history only after we've unpacked
1308 : * other resource history to the point that we know that the node's
1309 : * connection and containing resource are both up.
1310 : */
1311 0 : pcmk_resource_t *rsc = this_node->details->remote_rsc;
1312 :
1313 0 : if ((rsc == NULL) || (rsc->role != pcmk_role_started)
1314 0 : || (rsc->container->role != pcmk_role_started)) {
1315 0 : crm_trace("Not unpacking resource history for guest node %s "
1316 : "because container and connection are not known to "
1317 : "be up", id);
1318 0 : continue;
1319 : }
1320 :
1321 0 : } else if (pcmk__is_remote_node(this_node)) {
1322 : /* We can unpack a remote node's history only after we've unpacked
1323 : * other resource history to the point that we know that the node's
1324 : * connection is up, with the exception of when shutdown locks are
1325 : * in use.
1326 : */
1327 0 : pcmk_resource_t *rsc = this_node->details->remote_rsc;
1328 :
1329 0 : if ((rsc == NULL)
1330 0 : || (!pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)
1331 0 : && (rsc->role != pcmk_role_started))) {
1332 0 : crm_trace("Not unpacking resource history for remote node %s "
1333 : "because connection is not known to be up", id);
1334 0 : continue;
1335 : }
1336 :
1337 : /* If fencing and shutdown locks are disabled and we're not processing
1338 : * unseen nodes, then we don't want to unpack offline nodes until online
1339 : * nodes have been unpacked. This allows us to number active clone
1340 : * instances first.
1341 : */
1342 0 : } else if (!pcmk_any_flags_set(scheduler->flags,
1343 : pcmk_sched_fencing_enabled
1344 : |pcmk_sched_shutdown_lock)
1345 0 : && !this_node->details->online) {
1346 0 : crm_trace("Not unpacking resource history for offline "
1347 : "cluster node %s", id);
1348 0 : continue;
1349 : }
1350 :
1351 0 : if (pcmk__is_pacemaker_remote_node(this_node)) {
1352 0 : determine_remote_online_status(scheduler, this_node);
1353 0 : unpack_handle_remote_attrs(this_node, state, scheduler);
1354 : }
1355 :
1356 0 : crm_trace("Unpacking resource history for %snode %s",
1357 : (fence? "unseen " : ""), id);
1358 :
1359 0 : this_node->details->unpacked = TRUE;
1360 0 : unpack_node_lrm(this_node, state, scheduler);
1361 :
1362 0 : rc = EAGAIN; // Other node histories might depend on this one
1363 : }
1364 0 : return rc;
1365 : }
1366 :
1367 : /* remove nodes that are down, stopping */
1368 : /* create positive rsc_to_node constraints between resources and the nodes they are running on */
1369 : /* anything else? */
1370 : gboolean
1371 0 : unpack_status(xmlNode *status, pcmk_scheduler_t *scheduler)
1372 : {
1373 0 : xmlNode *state = NULL;
1374 :
1375 0 : crm_trace("Beginning unpack");
1376 :
1377 0 : if (scheduler->tickets == NULL) {
1378 0 : scheduler->tickets = pcmk__strkey_table(free, destroy_ticket);
1379 : }
1380 :
1381 0 : for (state = pcmk__xe_first_child(status, NULL, NULL, NULL); state != NULL;
1382 0 : state = pcmk__xe_next(state)) {
1383 :
1384 0 : if (pcmk__xe_is(state, PCMK_XE_TICKETS)) {
1385 0 : unpack_tickets_state((xmlNode *) state, scheduler);
1386 :
1387 0 : } else if (pcmk__xe_is(state, PCMK__XE_NODE_STATE)) {
1388 0 : unpack_node_state(state, scheduler);
1389 : }
1390 : }
1391 :
1392 0 : while (unpack_node_history(status, FALSE, scheduler) == EAGAIN) {
1393 0 : crm_trace("Another pass through node resource histories is needed");
1394 : }
1395 :
1396 : // Now catch any nodes we didn't see
1397 0 : unpack_node_history(status,
1398 0 : pcmk_is_set(scheduler->flags,
1399 : pcmk_sched_fencing_enabled),
1400 : scheduler);
1401 :
1402 : /* Now that we know where resources are, we can schedule stops of containers
1403 : * with failed bundle connections
1404 : */
1405 0 : if (scheduler->stop_needed != NULL) {
1406 0 : for (GList *item = scheduler->stop_needed; item; item = item->next) {
1407 0 : pcmk_resource_t *container = item->data;
1408 0 : pcmk_node_t *node = pcmk__current_node(container);
1409 :
1410 0 : if (node) {
1411 0 : stop_action(container, node, FALSE);
1412 : }
1413 : }
1414 0 : g_list_free(scheduler->stop_needed);
1415 0 : scheduler->stop_needed = NULL;
1416 : }
1417 :
1418 : /* Now that we know status of all Pacemaker Remote connections and nodes,
1419 : * we can stop connections for node shutdowns, and check the online status
1420 : * of remote/guest nodes that didn't have any node history to unpack.
1421 : */
1422 0 : for (GList *gIter = scheduler->nodes; gIter != NULL; gIter = gIter->next) {
1423 0 : pcmk_node_t *this_node = gIter->data;
1424 :
1425 0 : if (!pcmk__is_pacemaker_remote_node(this_node)) {
1426 0 : continue;
1427 : }
1428 0 : if (this_node->details->shutdown
1429 0 : && (this_node->details->remote_rsc != NULL)) {
1430 0 : pe__set_next_role(this_node->details->remote_rsc, pcmk_role_stopped,
1431 : "remote shutdown");
1432 : }
1433 0 : if (!this_node->details->unpacked) {
1434 0 : determine_remote_online_status(scheduler, this_node);
1435 : }
1436 : }
1437 :
1438 0 : return TRUE;
1439 : }
1440 :
1441 : /*!
1442 : * \internal
1443 : * \brief Unpack node's time when it became a member at the cluster layer
1444 : *
1445 : * \param[in] node_state Node's \c PCMK__XE_NODE_STATE entry
1446 : * \param[in,out] scheduler Scheduler data
1447 : *
1448 : * \return Epoch time when node became a cluster member
1449 : * (or scheduler effective time for legacy entries) if a member,
1450 : * 0 if not a member, or -1 if no valid information available
1451 : */
1452 : static long long
1453 0 : unpack_node_member(const xmlNode *node_state, pcmk_scheduler_t *scheduler)
1454 : {
1455 0 : const char *member_time = crm_element_value(node_state, PCMK__XA_IN_CCM);
1456 0 : int member = 0;
1457 :
1458 0 : if (member_time == NULL) {
1459 0 : return -1LL;
1460 :
1461 0 : } else if (crm_str_to_boolean(member_time, &member) == 1) {
1462 : /* If in_ccm=0, we'll return 0 here. If in_ccm=1, either the entry was
1463 : * recorded as a boolean for a DC < 2.1.7, or the node is pending
1464 : * shutdown and has left the CPG, in which case it was set to 1 to avoid
1465 : * fencing for PCMK_OPT_NODE_PENDING_TIMEOUT.
1466 : *
1467 : * We return the effective time for in_ccm=1 because what's important to
1468 : * avoid fencing is that effective time minus this value is less than
1469 : * the pending node timeout.
1470 : */
1471 0 : return member? (long long) get_effective_time(scheduler) : 0LL;
1472 :
1473 : } else {
1474 0 : long long when_member = 0LL;
1475 :
1476 0 : if ((pcmk__scan_ll(member_time, &when_member,
1477 0 : 0LL) != pcmk_rc_ok) || (when_member < 0LL)) {
1478 0 : crm_warn("Unrecognized value '%s' for " PCMK__XA_IN_CCM
1479 : " in " PCMK__XE_NODE_STATE " entry", member_time);
1480 0 : return -1LL;
1481 : }
1482 0 : return when_member;
1483 : }
1484 : }
1485 :
1486 : /*!
1487 : * \internal
1488 : * \brief Unpack node's time when it became online in process group
1489 : *
1490 : * \param[in] node_state Node's \c PCMK__XE_NODE_STATE entry
1491 : *
1492 : * \return Epoch time when node became online in process group (or 0 if not
1493 : * online, or 1 for legacy online entries)
1494 : */
1495 : static long long
1496 0 : unpack_node_online(const xmlNode *node_state)
1497 : {
1498 0 : const char *peer_time = crm_element_value(node_state, PCMK_XA_CRMD);
1499 :
1500 : // @COMPAT Entries recorded for DCs < 2.1.7 have "online" or "offline"
1501 0 : if (pcmk__str_eq(peer_time, PCMK_VALUE_OFFLINE,
1502 : pcmk__str_casei|pcmk__str_null_matches)) {
1503 0 : return 0LL;
1504 :
1505 0 : } else if (pcmk__str_eq(peer_time, PCMK_VALUE_ONLINE, pcmk__str_casei)) {
1506 0 : return 1LL;
1507 :
1508 : } else {
1509 0 : long long when_online = 0LL;
1510 :
1511 0 : if ((pcmk__scan_ll(peer_time, &when_online, 0LL) != pcmk_rc_ok)
1512 0 : || (when_online < 0)) {
1513 0 : crm_warn("Unrecognized value '%s' for " PCMK_XA_CRMD " in "
1514 : PCMK__XE_NODE_STATE " entry, assuming offline", peer_time);
1515 0 : return 0LL;
1516 : }
1517 0 : return when_online;
1518 : }
1519 : }
1520 :
1521 : /*!
1522 : * \internal
1523 : * \brief Unpack node attribute for user-requested fencing
1524 : *
1525 : * \param[in] node Node to check
1526 : * \param[in] node_state Node's \c PCMK__XE_NODE_STATE entry in CIB status
1527 : *
1528 : * \return \c true if fencing has been requested for \p node, otherwise \c false
1529 : */
1530 : static bool
1531 0 : unpack_node_terminate(const pcmk_node_t *node, const xmlNode *node_state)
1532 : {
1533 0 : long long value = 0LL;
1534 0 : int value_i = 0;
1535 0 : const char *value_s = pcmk__node_attr(node, PCMK_NODE_ATTR_TERMINATE,
1536 : NULL, pcmk__rsc_node_current);
1537 :
1538 : // Value may be boolean or an epoch time
1539 0 : if (crm_str_to_boolean(value_s, &value_i) == 1) {
1540 0 : return (value_i != 0);
1541 : }
1542 0 : if (pcmk__scan_ll(value_s, &value, 0LL) == pcmk_rc_ok) {
1543 0 : return (value > 0);
1544 : }
1545 0 : crm_warn("Ignoring unrecognized value '%s' for " PCMK_NODE_ATTR_TERMINATE
1546 : "node attribute for %s", value_s, pcmk__node_name(node));
1547 0 : return false;
1548 : }
1549 :
1550 : static gboolean
1551 0 : determine_online_status_no_fencing(pcmk_scheduler_t *scheduler,
1552 : const xmlNode *node_state,
1553 : pcmk_node_t *this_node)
1554 : {
1555 0 : gboolean online = FALSE;
1556 0 : const char *join = crm_element_value(node_state, PCMK__XA_JOIN);
1557 0 : const char *exp_state = crm_element_value(node_state, PCMK_XA_EXPECTED);
1558 0 : long long when_member = unpack_node_member(node_state, scheduler);
1559 0 : long long when_online = unpack_node_online(node_state);
1560 :
1561 0 : if (when_member <= 0) {
1562 0 : crm_trace("Node %s is %sdown", pcmk__node_name(this_node),
1563 : ((when_member < 0)? "presumed " : ""));
1564 :
1565 0 : } else if (when_online > 0) {
1566 0 : if (pcmk__str_eq(join, CRMD_JOINSTATE_MEMBER, pcmk__str_casei)) {
1567 0 : online = TRUE;
1568 : } else {
1569 0 : crm_debug("Node %s is not ready to run resources: %s",
1570 : pcmk__node_name(this_node), join);
1571 : }
1572 :
1573 0 : } else if (this_node->details->expected_up == FALSE) {
1574 0 : crm_trace("Node %s controller is down: "
1575 : "member@%lld online@%lld join=%s expected=%s",
1576 : pcmk__node_name(this_node), when_member, when_online,
1577 : pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"));
1578 :
1579 : } else {
1580 : /* mark it unclean */
1581 0 : pe_fence_node(scheduler, this_node, "peer is unexpectedly down", FALSE);
1582 0 : crm_info("Node %s member@%lld online@%lld join=%s expected=%s",
1583 : pcmk__node_name(this_node), when_member, when_online,
1584 : pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"));
1585 : }
1586 0 : return online;
1587 : }
1588 :
1589 : /*!
1590 : * \internal
1591 : * \brief Check whether a node has taken too long to join controller group
1592 : *
1593 : * \param[in,out] scheduler Scheduler data
1594 : * \param[in] node Node to check
1595 : * \param[in] when_member Epoch time when node became a cluster member
1596 : * \param[in] when_online Epoch time when node joined controller group
1597 : *
1598 : * \return true if node has been pending (on the way up) longer than
1599 : * \c PCMK_OPT_NODE_PENDING_TIMEOUT, otherwise false
1600 : * \note This will also update the cluster's recheck time if appropriate.
1601 : */
1602 : static inline bool
1603 0 : pending_too_long(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
1604 : long long when_member, long long when_online)
1605 : {
1606 0 : if ((scheduler->node_pending_timeout > 0)
1607 0 : && (when_member > 0) && (when_online <= 0)) {
1608 : // There is a timeout on pending nodes, and node is pending
1609 :
1610 0 : time_t timeout = when_member + scheduler->node_pending_timeout;
1611 :
1612 0 : if (get_effective_time(node->details->data_set) >= timeout) {
1613 0 : return true; // Node has timed out
1614 : }
1615 :
1616 : // Node is pending, but still has time
1617 0 : pe__update_recheck_time(timeout, scheduler, "pending node timeout");
1618 : }
1619 0 : return false;
1620 : }
1621 :
1622 : static bool
1623 0 : determine_online_status_fencing(pcmk_scheduler_t *scheduler,
1624 : const xmlNode *node_state,
1625 : pcmk_node_t *this_node)
1626 : {
1627 0 : bool termination_requested = unpack_node_terminate(this_node, node_state);
1628 0 : const char *join = crm_element_value(node_state, PCMK__XA_JOIN);
1629 0 : const char *exp_state = crm_element_value(node_state, PCMK_XA_EXPECTED);
1630 0 : long long when_member = unpack_node_member(node_state, scheduler);
1631 0 : long long when_online = unpack_node_online(node_state);
1632 :
1633 : /*
1634 : - PCMK__XA_JOIN ::= member|down|pending|banned
1635 : - PCMK_XA_EXPECTED ::= member|down
1636 :
1637 : @COMPAT with entries recorded for DCs < 2.1.7
1638 : - PCMK__XA_IN_CCM ::= true|false
1639 : - PCMK_XA_CRMD ::= online|offline
1640 :
1641 : Since crm_feature_set 3.18.0 (pacemaker-2.1.7):
1642 : - PCMK__XA_IN_CCM ::= <timestamp>|0
1643 : Since when node has been a cluster member. A value 0 of means the node is not
1644 : a cluster member.
1645 :
1646 : - PCMK_XA_CRMD ::= <timestamp>|0
1647 : Since when peer has been online in CPG. A value 0 means the peer is offline
1648 : in CPG.
1649 : */
1650 :
1651 0 : crm_trace("Node %s member@%lld online@%lld join=%s expected=%s%s",
1652 : pcmk__node_name(this_node), when_member, when_online,
1653 : pcmk__s(join, "<null>"), pcmk__s(exp_state, "<null>"),
1654 : (termination_requested? " (termination requested)" : ""));
1655 :
1656 0 : if (this_node->details->shutdown) {
1657 0 : crm_debug("%s is shutting down", pcmk__node_name(this_node));
1658 :
1659 : /* Slightly different criteria since we can't shut down a dead peer */
1660 0 : return (when_online > 0);
1661 : }
1662 :
1663 0 : if (when_member < 0) {
1664 0 : pe_fence_node(scheduler, this_node,
1665 : "peer has not been seen by the cluster", FALSE);
1666 0 : return false;
1667 : }
1668 :
1669 0 : if (pcmk__str_eq(join, CRMD_JOINSTATE_NACK, pcmk__str_none)) {
1670 0 : pe_fence_node(scheduler, this_node,
1671 : "peer failed Pacemaker membership criteria", FALSE);
1672 :
1673 0 : } else if (termination_requested) {
1674 0 : if ((when_member <= 0) && (when_online <= 0)
1675 0 : && pcmk__str_eq(join, CRMD_JOINSTATE_DOWN, pcmk__str_none)) {
1676 0 : crm_info("%s was fenced as requested", pcmk__node_name(this_node));
1677 0 : return false;
1678 : }
1679 0 : pe_fence_node(scheduler, this_node, "fencing was requested", false);
1680 :
1681 0 : } else if (pcmk__str_eq(exp_state, CRMD_JOINSTATE_DOWN,
1682 : pcmk__str_null_matches)) {
1683 :
1684 0 : if (pending_too_long(scheduler, this_node, when_member, when_online)) {
1685 0 : pe_fence_node(scheduler, this_node,
1686 : "peer pending timed out on joining the process group",
1687 : FALSE);
1688 :
1689 0 : } else if ((when_member > 0) || (when_online > 0)) {
1690 0 : crm_info("- %s is not ready to run resources",
1691 : pcmk__node_name(this_node));
1692 0 : this_node->details->standby = TRUE;
1693 0 : this_node->details->pending = TRUE;
1694 :
1695 : } else {
1696 0 : crm_trace("%s is down or still coming up",
1697 : pcmk__node_name(this_node));
1698 : }
1699 :
1700 0 : } else if (when_member <= 0) {
1701 : // Consider PCMK_OPT_PRIORITY_FENCING_DELAY for lost nodes
1702 0 : pe_fence_node(scheduler, this_node,
1703 : "peer is no longer part of the cluster", TRUE);
1704 :
1705 0 : } else if (when_online <= 0) {
1706 0 : pe_fence_node(scheduler, this_node,
1707 : "peer process is no longer available", FALSE);
1708 :
1709 : /* Everything is running at this point, now check join state */
1710 :
1711 0 : } else if (pcmk__str_eq(join, CRMD_JOINSTATE_MEMBER, pcmk__str_none)) {
1712 0 : crm_info("%s is active", pcmk__node_name(this_node));
1713 :
1714 0 : } else if (pcmk__str_any_of(join, CRMD_JOINSTATE_PENDING,
1715 : CRMD_JOINSTATE_DOWN, NULL)) {
1716 0 : crm_info("%s is not ready to run resources",
1717 : pcmk__node_name(this_node));
1718 0 : this_node->details->standby = TRUE;
1719 0 : this_node->details->pending = TRUE;
1720 :
1721 : } else {
1722 0 : pe_fence_node(scheduler, this_node, "peer was in an unknown state",
1723 : FALSE);
1724 : }
1725 :
1726 0 : return (when_member > 0);
1727 : }
1728 :
1729 : static void
1730 0 : determine_remote_online_status(pcmk_scheduler_t *scheduler,
1731 : pcmk_node_t *this_node)
1732 : {
1733 0 : pcmk_resource_t *rsc = this_node->details->remote_rsc;
1734 0 : pcmk_resource_t *container = NULL;
1735 0 : pcmk_node_t *host = NULL;
1736 :
1737 : /* If there is a node state entry for a (former) Pacemaker Remote node
1738 : * but no resource creating that node, the node's connection resource will
1739 : * be NULL. Consider it an offline remote node in that case.
1740 : */
1741 0 : if (rsc == NULL) {
1742 0 : this_node->details->online = FALSE;
1743 0 : goto remote_online_done;
1744 : }
1745 :
1746 0 : container = rsc->container;
1747 :
1748 0 : if (container && pcmk__list_of_1(rsc->running_on)) {
1749 0 : host = rsc->running_on->data;
1750 : }
1751 :
1752 : /* If the resource is currently started, mark it online. */
1753 0 : if (rsc->role == pcmk_role_started) {
1754 0 : crm_trace("%s node %s presumed ONLINE because connection resource is started",
1755 : (container? "Guest" : "Remote"), this_node->details->id);
1756 0 : this_node->details->online = TRUE;
1757 : }
1758 :
1759 : /* consider this node shutting down if transitioning start->stop */
1760 0 : if ((rsc->role == pcmk_role_started)
1761 0 : && (rsc->next_role == pcmk_role_stopped)) {
1762 :
1763 0 : crm_trace("%s node %s shutting down because connection resource is stopping",
1764 : (container? "Guest" : "Remote"), this_node->details->id);
1765 0 : this_node->details->shutdown = TRUE;
1766 : }
1767 :
1768 : /* Now check all the failure conditions. */
1769 0 : if(container && pcmk_is_set(container->flags, pcmk_rsc_failed)) {
1770 0 : crm_trace("Guest node %s UNCLEAN because guest resource failed",
1771 : this_node->details->id);
1772 0 : this_node->details->online = FALSE;
1773 0 : this_node->details->remote_requires_reset = TRUE;
1774 :
1775 0 : } else if (pcmk_is_set(rsc->flags, pcmk_rsc_failed)) {
1776 0 : crm_trace("%s node %s OFFLINE because connection resource failed",
1777 : (container? "Guest" : "Remote"), this_node->details->id);
1778 0 : this_node->details->online = FALSE;
1779 :
1780 0 : } else if ((rsc->role == pcmk_role_stopped)
1781 0 : || ((container != NULL)
1782 0 : && (container->role == pcmk_role_stopped))) {
1783 :
1784 0 : crm_trace("%s node %s OFFLINE because its resource is stopped",
1785 : (container? "Guest" : "Remote"), this_node->details->id);
1786 0 : this_node->details->online = FALSE;
1787 0 : this_node->details->remote_requires_reset = FALSE;
1788 :
1789 0 : } else if (host && (host->details->online == FALSE)
1790 0 : && host->details->unclean) {
1791 0 : crm_trace("Guest node %s UNCLEAN because host is unclean",
1792 : this_node->details->id);
1793 0 : this_node->details->online = FALSE;
1794 0 : this_node->details->remote_requires_reset = TRUE;
1795 : }
1796 :
1797 0 : remote_online_done:
1798 0 : crm_trace("Remote node %s online=%s",
1799 : this_node->details->id, this_node->details->online ? "TRUE" : "FALSE");
1800 0 : }
1801 :
1802 : static void
1803 0 : determine_online_status(const xmlNode *node_state, pcmk_node_t *this_node,
1804 : pcmk_scheduler_t *scheduler)
1805 : {
1806 0 : gboolean online = FALSE;
1807 0 : const char *exp_state = crm_element_value(node_state, PCMK_XA_EXPECTED);
1808 :
1809 0 : CRM_CHECK(this_node != NULL, return);
1810 :
1811 0 : this_node->details->shutdown = FALSE;
1812 0 : this_node->details->expected_up = FALSE;
1813 :
1814 0 : if (pe__shutdown_requested(this_node)) {
1815 0 : this_node->details->shutdown = TRUE;
1816 :
1817 0 : } else if (pcmk__str_eq(exp_state, CRMD_JOINSTATE_MEMBER, pcmk__str_casei)) {
1818 0 : this_node->details->expected_up = TRUE;
1819 : }
1820 :
1821 0 : if (this_node->details->type == node_ping) {
1822 0 : this_node->details->unclean = FALSE;
1823 0 : online = FALSE; /* As far as resource management is concerned,
1824 : * the node is safely offline.
1825 : * Anyone caught abusing this logic will be shot
1826 : */
1827 :
1828 0 : } else if (!pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
1829 0 : online = determine_online_status_no_fencing(scheduler, node_state,
1830 : this_node);
1831 :
1832 : } else {
1833 0 : online = determine_online_status_fencing(scheduler, node_state,
1834 : this_node);
1835 : }
1836 :
1837 0 : if (online) {
1838 0 : this_node->details->online = TRUE;
1839 :
1840 : } else {
1841 : /* remove node from contention */
1842 0 : this_node->fixed = TRUE; // @COMPAT deprecated and unused
1843 0 : this_node->weight = -PCMK_SCORE_INFINITY;
1844 : }
1845 :
1846 0 : if (online && this_node->details->shutdown) {
1847 : /* don't run resources here */
1848 0 : this_node->fixed = TRUE; // @COMPAT deprecated and unused
1849 0 : this_node->weight = -PCMK_SCORE_INFINITY;
1850 : }
1851 :
1852 0 : if (this_node->details->type == node_ping) {
1853 0 : crm_info("%s is not a Pacemaker node", pcmk__node_name(this_node));
1854 :
1855 0 : } else if (this_node->details->unclean) {
1856 0 : pcmk__sched_warn("%s is unclean", pcmk__node_name(this_node));
1857 :
1858 0 : } else if (this_node->details->online) {
1859 0 : crm_info("%s is %s", pcmk__node_name(this_node),
1860 : this_node->details->shutdown ? "shutting down" :
1861 : this_node->details->pending ? "pending" :
1862 : this_node->details->standby ? "standby" :
1863 : this_node->details->maintenance ? "maintenance" : "online");
1864 :
1865 : } else {
1866 0 : crm_trace("%s is offline", pcmk__node_name(this_node));
1867 : }
1868 : }
1869 :
/*!
 * \internal
 * \brief Find the end of a resource's name, excluding any clone suffix
 *
 * A clone suffix is a colon followed by zero or more decimal digits at the end
 * of the ID. The first character of the ID is never treated as part of a
 * suffix.
 *
 * \param[in] id  Resource ID to check
 *
 * \return Pointer to last character of resource's base name, or NULL if \p id
 *         is NULL or empty
 * \note If \p id ends with a bare colon (no digits after it), the returned
 *       pointer is to that colon.
 */
const char *
pe_base_name_end(const char *id)
{
    const char *last = NULL;
    const char *pos = NULL;

    if (pcmk__str_empty(id)) {
        return NULL;
    }
    last = id + strlen(id) - 1;

    // Walk backward over any trailing digits, never examining id[0]
    pos = last;
    while ((pos > id) && (*pos >= '0') && (*pos <= '9')) {
        --pos;
    }

    if ((pos > id) && (*pos == ':')) {
        // Found a suffix separator; the base name ends just before it
        return (pos == last)? pos : (pos - 1);
    }

    // No separator found, so the whole ID is the base name
    return last;
}
1907 :
/*!
 * \internal
 * \brief Get a resource name excluding any clone suffix
 *
 * \param[in] last_rsc_id  Resource ID to check
 *
 * \return Newly allocated copy of the base name portion of \p last_rsc_id
 * \note The caller is responsible for freeing the result with free(). This
 *       asserts on error (including when no base name can be determined), so
 *       callers can assume the result is not NULL.
 */
char *
clone_strip(const char *last_rsc_id)
{
    const char *end = pe_base_name_end(last_rsc_id);
    char *basename = NULL;
    size_t base_name_len = 0;

    CRM_ASSERT(end);

    // end points at the last character of the base name, hence the + 1
    base_name_len = end - last_rsc_id + 1;

    basename = strndup(last_rsc_id, base_name_len);
    CRM_ASSERT(basename);
    return basename;
}
1929 :
/*!
 * \internal
 * \brief Get the name of the first instance of a cloned resource
 *
 * \param[in] last_rsc_id  Resource ID to check
 *
 * \return Pointer to newly allocated string with resource's base name plus :0
 * \note It is the caller's responsibility to free() the result.
 *       This asserts on error, so callers can assume result is not NULL.
 */
char *
clone_zero(const char *last_rsc_id)
{
    const char *end = pe_base_name_end(last_rsc_id);
    size_t base_name_len = 0;
    char *zero = NULL;

    /* Validate end before using it in pointer arithmetic, so a NULL result
     * from pe_base_name_end() is caught by the assertion rather than being
     * subtracted from first.
     */
    CRM_ASSERT(end);
    base_name_len = end - last_rsc_id + 1;

    // Room for the base name, ':', '0', and the string terminator
    zero = pcmk__assert_alloc(base_name_len + 3, sizeof(char));
    memcpy(zero, last_rsc_id, base_name_len);
    zero[base_name_len] = ':';
    zero[base_name_len + 1] = '0';
    zero[base_name_len + 2] = '\0';
    return zero;
}
1954 :
1955 : static pcmk_resource_t *
1956 0 : create_fake_resource(const char *rsc_id, const xmlNode *rsc_entry,
1957 : pcmk_scheduler_t *scheduler)
1958 : {
1959 0 : pcmk_resource_t *rsc = NULL;
1960 0 : xmlNode *xml_rsc = pcmk__xe_create(NULL, PCMK_XE_PRIMITIVE);
1961 :
1962 0 : pcmk__xe_copy_attrs(xml_rsc, rsc_entry, pcmk__xaf_none);
1963 0 : crm_xml_add(xml_rsc, PCMK_XA_ID, rsc_id);
1964 0 : crm_log_xml_debug(xml_rsc, "Orphan resource");
1965 :
1966 0 : if (pe__unpack_resource(xml_rsc, &rsc, NULL, scheduler) != pcmk_rc_ok) {
1967 0 : return NULL;
1968 : }
1969 :
1970 0 : if (xml_contains_remote_node(xml_rsc)) {
1971 : pcmk_node_t *node;
1972 :
1973 0 : crm_debug("Detected orphaned remote node %s", rsc_id);
1974 0 : node = pcmk_find_node(scheduler, rsc_id);
1975 0 : if (node == NULL) {
1976 0 : node = pe_create_node(rsc_id, rsc_id, PCMK_VALUE_REMOTE, NULL,
1977 : scheduler);
1978 : }
1979 0 : link_rsc2remotenode(scheduler, rsc);
1980 :
1981 0 : if (node) {
1982 0 : crm_trace("Setting node %s as shutting down due to orphaned connection resource", rsc_id);
1983 0 : node->details->shutdown = TRUE;
1984 : }
1985 : }
1986 :
1987 0 : if (crm_element_value(rsc_entry, PCMK__META_CONTAINER)) {
1988 : /* This orphaned rsc needs to be mapped to a container. */
1989 0 : crm_trace("Detected orphaned container filler %s", rsc_id);
1990 0 : pcmk__set_rsc_flags(rsc, pcmk_rsc_removed_filler);
1991 : }
1992 0 : pcmk__set_rsc_flags(rsc, pcmk_rsc_removed);
1993 0 : scheduler->resources = g_list_append(scheduler->resources, rsc);
1994 0 : return rsc;
1995 : }
1996 :
1997 : /*!
1998 : * \internal
1999 : * \brief Create orphan instance for anonymous clone resource history
2000 : *
2001 : * \param[in,out] parent Clone resource that orphan will be added to
2002 : * \param[in] rsc_id Orphan's resource ID
2003 : * \param[in] node Where orphan is active (for logging only)
2004 : * \param[in,out] scheduler Scheduler data
2005 : *
2006 : * \return Newly added orphaned instance of \p parent
2007 : */
2008 : static pcmk_resource_t *
2009 0 : create_anonymous_orphan(pcmk_resource_t *parent, const char *rsc_id,
2010 : const pcmk_node_t *node, pcmk_scheduler_t *scheduler)
2011 : {
2012 0 : pcmk_resource_t *top = pe__create_clone_child(parent, scheduler);
2013 :
2014 : // find_rsc() because we might be a cloned group
2015 0 : pcmk_resource_t *orphan = top->fns->find_rsc(top, rsc_id, NULL,
2016 : pcmk_rsc_match_clone_only);
2017 :
2018 0 : pcmk__rsc_debug(parent, "Created orphan %s for %s: %s on %s",
2019 : top->id, parent->id, rsc_id, pcmk__node_name(node));
2020 0 : return orphan;
2021 : }
2022 :
/*!
 * \internal
 * \brief Check a node for an instance of an anonymous clone
 *
 * Return a child instance of the specified anonymous clone, in order of
 * preference: (1) the instance running on the specified node, if any;
 * (2) an inactive instance (i.e. within the total of \c PCMK_META_CLONE_MAX
 * instances); (3) a newly created orphan (that is, \c PCMK_META_CLONE_MAX
 * instances are already active).
 *
 * \param[in,out] scheduler  Scheduler data
 * \param[in]     node       Node on which to check for instance
 * \param[in,out] parent     Clone to check
 * \param[in]     rsc_id     Name of cloned resource in history (no instance)
 *
 * \return Resource within \p parent to associate with the history for
 *         \p rsc_id on \p node (an existing instance or a new orphan)
 */
static pcmk_resource_t *
find_anonymous_clone(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
                     pcmk_resource_t *parent, const char *rsc_id)
{
    GList *rIter = NULL;
    pcmk_resource_t *rsc = NULL;
    pcmk_resource_t *inactive_instance = NULL;
    gboolean skip_inactive = FALSE;

    CRM_ASSERT(parent != NULL);
    CRM_ASSERT(pcmk__is_clone(parent));
    CRM_ASSERT(!pcmk_is_set(parent->flags, pcmk_rsc_unique));

    // Check for active (or partially active, for cloned groups) instance
    pcmk__rsc_trace(parent, "Looking for %s on %s in %s",
                    rsc_id, pcmk__node_name(node), parent->id);

    // The loop ends as soon as an active match has been stored in rsc
    for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) {
        GList *locations = NULL;
        pcmk_resource_t *child = rIter->data;

        /* Check whether this instance is already known to be active or pending
         * anywhere, at this stage of unpacking. Because this function is called
         * for a resource before the resource's individual operation history
         * entries are unpacked, locations will generally not contain the
         * desired node.
         *
         * However, there are three exceptions:
         * (1) when child is a cloned group and we have already unpacked the
         *     history of another member of the group on the same node;
         * (2) when we've already unpacked the history of another numbered
         *     instance on the same node (which can happen if
         *     PCMK_META_GLOBALLY_UNIQUE was flipped from true to false); and
         * (3) when we re-run calculations on the same scheduler data as part of
         *     a simulation.
         */
        child->fns->location(child, &locations, 2);
        if (locations) {
            /* We should never associate the same numbered anonymous clone
             * instance with multiple nodes, and clone instances can't migrate,
             * so there must be only one location, regardless of history.
             */
            CRM_LOG_ASSERT(locations->next == NULL);

            if (pcmk__same_node((pcmk_node_t *) locations->data, node)) {
                /* This child instance is active on the requested node, so check
                 * for a corresponding configured resource. We use find_rsc()
                 * instead of child because child may be a cloned group, and we
                 * need the particular member corresponding to rsc_id.
                 *
                 * If the history entry is orphaned, rsc will be NULL.
                 */
                rsc = parent->fns->find_rsc(child, rsc_id, NULL,
                                            pcmk_rsc_match_clone_only);
                if (rsc) {
                    /* If there are multiple instance history entries for an
                     * anonymous clone in a single node's history (which can
                     * happen if PCMK_META_GLOBALLY_UNIQUE is switched from true
                     * to false), we want to consider the instances beyond the
                     * first as orphans, even if there are inactive instance
                     * numbers available.
                     */
                    if (rsc->running_on) {
                        crm_notice("Active (now-)anonymous clone %s has "
                                   "multiple (orphan) instance histories on %s",
                                   parent->id, pcmk__node_name(node));
                        skip_inactive = TRUE;
                        rsc = NULL;
                    } else {
                        pcmk__rsc_trace(parent, "Resource %s, active", rsc->id);
                    }
                }
            }
            g_list_free(locations);

        } else {
            pcmk__rsc_trace(parent, "Resource %s, skip inactive", child->id);
            if (!skip_inactive && !inactive_instance
                && !pcmk_is_set(child->flags, pcmk_rsc_blocked)) {
                // Remember one inactive instance in case we don't find active
                inactive_instance = parent->fns->find_rsc(child, rsc_id, NULL,
                                                          pcmk_rsc_match_clone_only);

                /* ... but don't use it if it was already associated with a
                 * pending action on another node
                 */
                if ((inactive_instance != NULL) &&
                    (inactive_instance->pending_node != NULL) &&
                    !pcmk__same_node(inactive_instance->pending_node, node)) {
                    inactive_instance = NULL;
                }
            }
        }
    }

    // No active instance was found, so fall back to a remembered inactive one
    if ((rsc == NULL) && !skip_inactive && (inactive_instance != NULL)) {
        pcmk__rsc_trace(parent, "Resource %s, empty slot",
                        inactive_instance->id);
        rsc = inactive_instance;
    }

    /* If the resource has PCMK_META_REQUIRES set to PCMK_VALUE_QUORUM or
     * PCMK_VALUE_NOTHING, and we don't have a clone instance for every node, we
     * don't want to consume a valid instance number for unclean nodes. Such
     * instances may appear to be active according to the history, but should be
     * considered inactive, so we can start an instance elsewhere. Treat such
     * instances as orphans.
     *
     * An exception is instances running on guest nodes -- since guest node
     * "fencing" is actually just a resource stop, requires shouldn't apply.
     *
     * @TODO Ideally, we'd use an inactive instance number if it is not needed
     * for any clean instances. However, we don't know that at this point.
     */
    if ((rsc != NULL) && !pcmk_is_set(rsc->flags, pcmk_rsc_needs_fencing)
        && (!node->details->online || node->details->unclean)
        && !pcmk__is_guest_or_bundle_node(node)
        && !pe__is_universal_clone(parent, scheduler)) {

        rsc = NULL;
    }

    // Nothing usable was found (or it was rejected above), so create an orphan
    if (rsc == NULL) {
        rsc = create_anonymous_orphan(parent, rsc_id, node, scheduler);
        pcmk__rsc_trace(parent, "Resource %s, orphan", rsc->id);
    }
    return rsc;
}
2165 :
2166 : static pcmk_resource_t *
2167 0 : unpack_find_resource(pcmk_scheduler_t *scheduler, const pcmk_node_t *node,
2168 : const char *rsc_id)
2169 : {
2170 0 : pcmk_resource_t *rsc = NULL;
2171 0 : pcmk_resource_t *parent = NULL;
2172 :
2173 0 : crm_trace("looking for %s", rsc_id);
2174 0 : rsc = pe_find_resource(scheduler->resources, rsc_id);
2175 :
2176 0 : if (rsc == NULL) {
2177 : /* If we didn't find the resource by its name in the operation history,
2178 : * check it again as a clone instance. Even when PCMK_META_CLONE_MAX=0,
2179 : * we create a single :0 orphan to match against here.
2180 : */
2181 0 : char *clone0_id = clone_zero(rsc_id);
2182 0 : pcmk_resource_t *clone0 = pe_find_resource(scheduler->resources,
2183 : clone0_id);
2184 :
2185 0 : if (clone0 && !pcmk_is_set(clone0->flags, pcmk_rsc_unique)) {
2186 0 : rsc = clone0;
2187 0 : parent = uber_parent(clone0);
2188 0 : crm_trace("%s found as %s (%s)", rsc_id, clone0_id, parent->id);
2189 : } else {
2190 0 : crm_trace("%s is not known as %s either (orphan)",
2191 : rsc_id, clone0_id);
2192 : }
2193 0 : free(clone0_id);
2194 :
2195 0 : } else if (rsc->variant > pcmk_rsc_variant_primitive) {
2196 0 : crm_trace("Resource history for %s is orphaned because it is no longer primitive",
2197 : rsc_id);
2198 0 : return NULL;
2199 :
2200 : } else {
2201 0 : parent = uber_parent(rsc);
2202 : }
2203 :
2204 0 : if (pcmk__is_anonymous_clone(parent)) {
2205 :
2206 0 : if (pcmk__is_bundled(parent)) {
2207 0 : rsc = pe__find_bundle_replica(parent->parent, node);
2208 : } else {
2209 0 : char *base = clone_strip(rsc_id);
2210 :
2211 0 : rsc = find_anonymous_clone(scheduler, node, parent, base);
2212 0 : free(base);
2213 0 : CRM_ASSERT(rsc != NULL);
2214 : }
2215 : }
2216 :
2217 0 : if (rsc && !pcmk__str_eq(rsc_id, rsc->id, pcmk__str_casei)
2218 0 : && !pcmk__str_eq(rsc_id, rsc->clone_name, pcmk__str_casei)) {
2219 :
2220 0 : pcmk__str_update(&rsc->clone_name, rsc_id);
2221 0 : pcmk__rsc_debug(rsc, "Internally renamed %s on %s to %s%s",
2222 : rsc_id, pcmk__node_name(node), rsc->id,
2223 : pcmk_is_set(rsc->flags, pcmk_rsc_removed)? " (ORPHAN)" : "");
2224 : }
2225 0 : return rsc;
2226 : }
2227 :
2228 : static pcmk_resource_t *
2229 0 : process_orphan_resource(const xmlNode *rsc_entry, const pcmk_node_t *node,
2230 : pcmk_scheduler_t *scheduler)
2231 : {
2232 0 : pcmk_resource_t *rsc = NULL;
2233 0 : const char *rsc_id = crm_element_value(rsc_entry, PCMK_XA_ID);
2234 :
2235 0 : crm_debug("Detected orphan resource %s on %s",
2236 : rsc_id, pcmk__node_name(node));
2237 0 : rsc = create_fake_resource(rsc_id, rsc_entry, scheduler);
2238 0 : if (rsc == NULL) {
2239 0 : return NULL;
2240 : }
2241 :
2242 0 : if (!pcmk_is_set(scheduler->flags, pcmk_sched_stop_removed_resources)) {
2243 0 : pcmk__clear_rsc_flags(rsc, pcmk_rsc_managed);
2244 :
2245 : } else {
2246 0 : CRM_CHECK(rsc != NULL, return NULL);
2247 0 : pcmk__rsc_trace(rsc, "Added orphan %s", rsc->id);
2248 0 : resource_location(rsc, NULL, -PCMK_SCORE_INFINITY,
2249 : "__orphan_do_not_run__", scheduler);
2250 : }
2251 0 : return rsc;
2252 : }
2253 :
/*!
 * \internal
 * \brief Apply a resource's current role and failure handling on a node
 *
 * In order: record that the resource's state is known on the node, fence the
 * node if a managed resource appears active on it while it is offline, apply
 * the requested failure response, and then either register the resource as
 * running on the node or (if it is stopped) tidy up its clone name or mark
 * its existing stop actions optional.
 *
 * \param[in,out] rsc      Resource whose state on \p node is being processed
 * \param[in,out] node     Node that the state applies to
 * \param[in]     on_fail  Failure response to apply
 */
static void
process_rsc_state(pcmk_resource_t *rsc, pcmk_node_t *node,
                  enum action_fail_response on_fail)
{
    pcmk_node_t *tmpnode = NULL;
    char *reason = NULL;
    enum action_fail_response save_on_fail = pcmk_on_fail_ignore;

    CRM_ASSERT(rsc);
    pcmk__rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s",
                    rsc->id, pcmk_role_text(rsc->role), pcmk__node_name(node),
                    pcmk_on_fail_text(on_fail));

    /* process current state */
    if (rsc->role != pcmk_role_unknown) {
        pcmk_resource_t *iter = rsc;

        /* Add a copy of the node to known_on for the resource and each of its
         * ancestors, stopping after the first one flagged as unique
         */
        while (iter) {
            if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) {
                pcmk_node_t *n = pe__copy_node(node);

                pcmk__rsc_trace(rsc, "%s%s%s known on %s",
                                rsc->id,
                                ((rsc->clone_name == NULL)? "" : " also known as "),
                                ((rsc->clone_name == NULL)? "" : rsc->clone_name),
                                pcmk__node_name(n));
                g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n);
            }
            if (pcmk_is_set(iter->flags, pcmk_rsc_unique)) {
                break;
            }
            iter = iter->parent;
        }
    }

    /* If a managed resource is believed to be running, but node is down ... */
    if ((rsc->role > pcmk_role_stopped)
        && node->details->online == FALSE
        && node->details->maintenance == FALSE
        && pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {

        gboolean should_fence = FALSE;

        /* If this is a guest node, fence it (regardless of whether fencing is
         * enabled, because guest node fencing is done by recovery of the
         * container resource rather than by the fencer). Mark the resource
         * we're processing as failed. When the guest comes back up, its
         * operation history in the CIB will be cleared, freeing the affected
         * resource to run again once we are sure we know its state.
         */
        if (pcmk__is_guest_or_bundle_node(node)) {
            pcmk__set_rsc_flags(rsc, pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
            should_fence = TRUE;

        } else if (pcmk_is_set(rsc->cluster->flags,
                               pcmk_sched_fencing_enabled)) {
            if (pcmk__is_remote_node(node)
                && (node->details->remote_rsc != NULL)
                && !pcmk_is_set(node->details->remote_rsc->flags,
                                pcmk_rsc_failed)) {

                /* Setting unseen means that fencing of the remote node will
                 * occur only if the connection resource is not going to start
                 * somewhere. This allows connection resources on a failed
                 * cluster node to move to another node without requiring the
                 * remote nodes to be fenced as well.
                 */
                node->details->unseen = TRUE;
                reason = crm_strdup_printf("%s is active there (fencing will be"
                                           " revoked if remote connection can "
                                           "be re-established elsewhere)",
                                           rsc->id);
            }
            should_fence = TRUE;
        }

        if (should_fence) {
            // Use the generic reason unless a more specific one was set above
            if (reason == NULL) {
                reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
            }
            pe_fence_node(rsc->cluster, node, reason, FALSE);
        }
        free(reason);
    }

    /* In order to calculate priority_fencing_delay correctly, save the failure
     * information and pass it to native_add_running().
     */
    save_on_fail = on_fail;

    if (node->details->unclean) {
        /* No extra processing needed
         * Also allows resources to be started again after a node is shot
         */
        on_fail = pcmk_on_fail_ignore;
    }

    switch (on_fail) {
        case pcmk_on_fail_ignore:
            /* nothing to do */
            break;

        case pcmk_on_fail_demote:
            pcmk__set_rsc_flags(rsc, pcmk_rsc_failed);
            demote_action(rsc, node, FALSE);
            break;

        case pcmk_on_fail_fence_node:
            /* treat it as if it is still running
             * but also mark the node as unclean
             */
            reason = crm_strdup_printf("%s failed there", rsc->id);
            pe_fence_node(rsc->cluster, node, reason, FALSE);
            free(reason);
            break;

        case pcmk_on_fail_standby_node:
            node->details->standby = TRUE;
            node->details->standby_onfail = TRUE;
            break;

        case pcmk_on_fail_block:
            /* is_managed == FALSE will prevent any
             * actions being sent for the resource
             */
            pcmk__clear_rsc_flags(rsc, pcmk_rsc_managed);
            pcmk__set_rsc_flags(rsc, pcmk_rsc_blocked);
            break;

        case pcmk_on_fail_ban:
            /* make sure it comes up somewhere else
             * or not at all
             */
            resource_location(rsc, node, -PCMK_SCORE_INFINITY,
                              "__action_migration_auto__", rsc->cluster);
            break;

        case pcmk_on_fail_stop:
            pe__set_next_role(rsc, pcmk_role_stopped,
                              PCMK_META_ON_FAIL "=" PCMK_VALUE_STOP);
            break;

        case pcmk_on_fail_restart:
            if ((rsc->role != pcmk_role_stopped)
                && (rsc->role != pcmk_role_unknown)) {
                pcmk__set_rsc_flags(rsc,
                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
                stop_action(rsc, node, FALSE);
            }
            break;

        case pcmk_on_fail_restart_container:
            pcmk__set_rsc_flags(rsc, pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
            if ((rsc->container != NULL) && pcmk__is_bundled(rsc)) {
                /* A bundle's remote connection can run on a different node than
                 * the bundle's container. We don't necessarily know where the
                 * container is running yet, so remember it and add a stop
                 * action for it later.
                 */
                rsc->cluster->stop_needed =
                    g_list_prepend(rsc->cluster->stop_needed, rsc->container);
            } else if (rsc->container) {
                stop_action(rsc->container, node, FALSE);
            } else if ((rsc->role != pcmk_role_stopped)
                       && (rsc->role != pcmk_role_unknown)) {
                stop_action(rsc, node, FALSE);
            }
            break;

        case pcmk_on_fail_reset_remote:
            pcmk__set_rsc_flags(rsc, pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
            if (pcmk_is_set(rsc->cluster->flags, pcmk_sched_fencing_enabled)) {
                tmpnode = NULL;
                if (rsc->is_remote_node) {
                    tmpnode = pcmk_find_node(rsc->cluster, rsc->id);
                }
                if (pcmk__is_remote_node(tmpnode)
                    && !(tmpnode->details->remote_was_fenced)) {
                    /* The remote connection resource failed in a way that
                     * should result in fencing the remote node.
                     */
                    pe_fence_node(rsc->cluster, tmpnode,
                                  "remote connection is unrecoverable", FALSE);
                }
            }

            /* require the stop action regardless if fencing is occurring or not. */
            if (rsc->role > pcmk_role_stopped) {
                stop_action(rsc, node, FALSE);
            }

            /* if reconnect delay is in use, prevent the connection from exiting the
             * "STOPPED" role until the failure is cleared by the delay timeout. */
            if (rsc->remote_reconnect_ms) {
                pe__set_next_role(rsc, pcmk_role_stopped, "remote reset");
            }
            break;
    }

    /* ensure a remote-node connection failure forces an unclean remote-node
     * to be fenced. By setting unseen = FALSE, the remote-node failure will
     * result in a fencing operation regardless if we're going to attempt to
     * reconnect to the remote-node in this transition or not. */
    if (pcmk_is_set(rsc->flags, pcmk_rsc_failed) && rsc->is_remote_node) {
        tmpnode = pcmk_find_node(rsc->cluster, rsc->id);
        if (tmpnode && tmpnode->details->unclean) {
            tmpnode->details->unseen = FALSE;
        }
    }

    if ((rsc->role != pcmk_role_stopped)
        && (rsc->role != pcmk_role_unknown)) {
        // The resource is active in some role on this node
        if (pcmk_is_set(rsc->flags, pcmk_rsc_removed)) {
            if (pcmk_is_set(rsc->flags, pcmk_rsc_managed)) {
                crm_notice("Removed resource %s is active on %s and will be "
                           "stopped when possible",
                           rsc->id, pcmk__node_name(node));
            } else {
                crm_notice("Removed resource %s must be stopped manually on %s "
                           "because " PCMK_OPT_STOP_ORPHAN_RESOURCES
                           " is set to false", rsc->id, pcmk__node_name(node));
            }
        }

        /* Pass whether the originally requested failure response (before any
         * override for an unclean node) was something other than "ignore"
         */
        native_add_running(rsc, node, rsc->cluster,
                           (save_on_fail != pcmk_on_fail_ignore));
        switch (on_fail) {
            case pcmk_on_fail_ignore:
                break;
            case pcmk_on_fail_demote:
            case pcmk_on_fail_block:
                pcmk__set_rsc_flags(rsc, pcmk_rsc_failed);
                break;
            default:
                pcmk__set_rsc_flags(rsc,
                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
                break;
        }

    } else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) {
        /* Only do this for older status sections that included instance numbers
         * Otherwise stopped instances will appear as orphans
         */
        pcmk__rsc_trace(rsc, "Resetting clone_name %s for %s (stopped)",
                        rsc->clone_name, rsc->id);
        free(rsc->clone_name);
        rsc->clone_name = NULL;

    } else {
        // Mark every stop action found for the resource on this node optional
        GList *possible_matches = pe__resource_actions(rsc, node,
                                                       PCMK_ACTION_STOP, FALSE);
        GList *gIter = possible_matches;

        for (; gIter != NULL; gIter = gIter->next) {
            pcmk_action_t *stop = (pcmk_action_t *) gIter->data;

            pcmk__set_action_flags(stop, pcmk_action_optional);
        }

        g_list_free(possible_matches);
    }

    /* A successful stop after migrate_to on the migration source doesn't make
     * the partially migrated resource stopped on the migration target.
     */
    if ((rsc->role == pcmk_role_stopped)
        && rsc->partial_migration_source
        && rsc->partial_migration_source->details == node->details
        && rsc->partial_migration_target
        && rsc->running_on) {

        rsc->role = pcmk_role_started;
    }
}
2526 :
2527 : /* create active recurring operations as optional */
2528 : static void
2529 0 : process_recurring(pcmk_node_t *node, pcmk_resource_t *rsc,
2530 : int start_index, int stop_index,
2531 : GList *sorted_op_list, pcmk_scheduler_t *scheduler)
2532 : {
2533 0 : int counter = -1;
2534 0 : const char *task = NULL;
2535 0 : const char *status = NULL;
2536 0 : GList *gIter = sorted_op_list;
2537 :
2538 0 : CRM_ASSERT(rsc);
2539 0 : pcmk__rsc_trace(rsc, "%s: Start index %d, stop index = %d",
2540 : rsc->id, start_index, stop_index);
2541 :
2542 0 : for (; gIter != NULL; gIter = gIter->next) {
2543 0 : xmlNode *rsc_op = (xmlNode *) gIter->data;
2544 :
2545 0 : guint interval_ms = 0;
2546 0 : char *key = NULL;
2547 0 : const char *id = pcmk__xe_id(rsc_op);
2548 :
2549 0 : counter++;
2550 :
2551 0 : if (node->details->online == FALSE) {
2552 0 : pcmk__rsc_trace(rsc, "Skipping %s on %s: node is offline",
2553 : rsc->id, pcmk__node_name(node));
2554 0 : break;
2555 :
2556 : /* Need to check if there's a monitor for role="Stopped" */
2557 0 : } else if (start_index < stop_index && counter <= stop_index) {
2558 0 : pcmk__rsc_trace(rsc, "Skipping %s on %s: resource is not active",
2559 : id, pcmk__node_name(node));
2560 0 : continue;
2561 :
2562 0 : } else if (counter < start_index) {
2563 0 : pcmk__rsc_trace(rsc, "Skipping %s on %s: old %d",
2564 : id, pcmk__node_name(node), counter);
2565 0 : continue;
2566 : }
2567 :
2568 0 : crm_element_value_ms(rsc_op, PCMK_META_INTERVAL, &interval_ms);
2569 0 : if (interval_ms == 0) {
2570 0 : pcmk__rsc_trace(rsc, "Skipping %s on %s: non-recurring",
2571 : id, pcmk__node_name(node));
2572 0 : continue;
2573 : }
2574 :
2575 0 : status = crm_element_value(rsc_op, PCMK__XA_OP_STATUS);
2576 0 : if (pcmk__str_eq(status, "-1", pcmk__str_casei)) {
2577 0 : pcmk__rsc_trace(rsc, "Skipping %s on %s: status",
2578 : id, pcmk__node_name(node));
2579 0 : continue;
2580 : }
2581 0 : task = crm_element_value(rsc_op, PCMK_XA_OPERATION);
2582 : /* create the action */
2583 0 : key = pcmk__op_key(rsc->id, task, interval_ms);
2584 0 : pcmk__rsc_trace(rsc, "Creating %s on %s", key, pcmk__node_name(node));
2585 0 : custom_action(rsc, key, task, node, TRUE, scheduler);
2586 : }
2587 0 : }
2588 :
2589 : void
2590 0 : calculate_active_ops(const GList *sorted_op_list, int *start_index,
2591 : int *stop_index)
2592 : {
2593 0 : int counter = -1;
2594 0 : int implied_monitor_start = -1;
2595 0 : int implied_clone_start = -1;
2596 0 : const char *task = NULL;
2597 0 : const char *status = NULL;
2598 :
2599 0 : *stop_index = -1;
2600 0 : *start_index = -1;
2601 :
2602 0 : for (const GList *iter = sorted_op_list; iter != NULL; iter = iter->next) {
2603 0 : const xmlNode *rsc_op = (const xmlNode *) iter->data;
2604 :
2605 0 : counter++;
2606 :
2607 0 : task = crm_element_value(rsc_op, PCMK_XA_OPERATION);
2608 0 : status = crm_element_value(rsc_op, PCMK__XA_OP_STATUS);
2609 :
2610 0 : if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_casei)
2611 0 : && pcmk__str_eq(status, "0", pcmk__str_casei)) {
2612 0 : *stop_index = counter;
2613 :
2614 0 : } else if (pcmk__strcase_any_of(task, PCMK_ACTION_START,
2615 : PCMK_ACTION_MIGRATE_FROM, NULL)) {
2616 0 : *start_index = counter;
2617 :
2618 0 : } else if ((implied_monitor_start <= *stop_index)
2619 0 : && pcmk__str_eq(task, PCMK_ACTION_MONITOR,
2620 0 : pcmk__str_casei)) {
2621 0 : const char *rc = crm_element_value(rsc_op, PCMK__XA_RC_CODE);
2622 :
2623 0 : if (pcmk__strcase_any_of(rc, "0", "8", NULL)) {
2624 0 : implied_monitor_start = counter;
2625 : }
2626 0 : } else if (pcmk__strcase_any_of(task, PCMK_ACTION_PROMOTE,
2627 : PCMK_ACTION_DEMOTE, NULL)) {
2628 0 : implied_clone_start = counter;
2629 : }
2630 : }
2631 :
2632 0 : if (*start_index == -1) {
2633 0 : if (implied_clone_start != -1) {
2634 0 : *start_index = implied_clone_start;
2635 0 : } else if (implied_monitor_start != -1) {
2636 0 : *start_index = implied_monitor_start;
2637 : }
2638 : }
2639 0 : }
2640 :
2641 : // If resource history entry has shutdown lock, remember lock node and time
2642 : static void
2643 0 : unpack_shutdown_lock(const xmlNode *rsc_entry, pcmk_resource_t *rsc,
2644 : const pcmk_node_t *node, pcmk_scheduler_t *scheduler)
2645 : {
2646 0 : time_t lock_time = 0; // When lock started (i.e. node shutdown time)
2647 :
2648 0 : if ((crm_element_value_epoch(rsc_entry, PCMK_OPT_SHUTDOWN_LOCK,
2649 0 : &lock_time) == pcmk_ok) && (lock_time != 0)) {
2650 :
2651 0 : if ((scheduler->shutdown_lock > 0)
2652 0 : && (get_effective_time(scheduler)
2653 0 : > (lock_time + scheduler->shutdown_lock))) {
2654 0 : pcmk__rsc_info(rsc, "Shutdown lock for %s on %s expired",
2655 : rsc->id, pcmk__node_name(node));
2656 0 : pe__clear_resource_history(rsc, node);
2657 : } else {
2658 : /* @COMPAT I don't like breaking const signatures, but
2659 : * rsc->lock_node should really be const -- we just can't change it
2660 : * until the next API compatibility break.
2661 : */
2662 0 : rsc->lock_node = (pcmk_node_t *) node;
2663 0 : rsc->lock_time = lock_time;
2664 : }
2665 : }
2666 0 : }
2667 :
2668 : /*!
2669 : * \internal
2670 : * \brief Unpack one \c PCMK__XE_LRM_RESOURCE entry from a node's CIB status
2671 : *
2672 : * \param[in,out] node Node whose status is being unpacked
2673 : * \param[in] rsc_entry \c PCMK__XE_LRM_RESOURCE XML being unpacked
2674 : * \param[in,out] scheduler Scheduler data
2675 : *
2676 : * \return Resource corresponding to the entry, or NULL if no operation history
2677 : */
2678 : static pcmk_resource_t *
2679 0 : unpack_lrm_resource(pcmk_node_t *node, const xmlNode *lrm_resource,
2680 : pcmk_scheduler_t *scheduler)
2681 : {
2682 0 : GList *gIter = NULL;
2683 0 : int stop_index = -1;
2684 0 : int start_index = -1;
2685 0 : enum rsc_role_e req_role = pcmk_role_unknown;
2686 :
2687 0 : const char *rsc_id = pcmk__xe_id(lrm_resource);
2688 :
2689 0 : pcmk_resource_t *rsc = NULL;
2690 0 : GList *op_list = NULL;
2691 0 : GList *sorted_op_list = NULL;
2692 :
2693 0 : xmlNode *rsc_op = NULL;
2694 0 : xmlNode *last_failure = NULL;
2695 :
2696 0 : enum action_fail_response on_fail = pcmk_on_fail_ignore;
2697 0 : enum rsc_role_e saved_role = pcmk_role_unknown;
2698 :
2699 0 : if (rsc_id == NULL) {
2700 0 : pcmk__config_err("Ignoring invalid " PCMK__XE_LRM_RESOURCE
2701 : " entry: No " PCMK_XA_ID);
2702 0 : crm_log_xml_info(lrm_resource, "missing-id");
2703 0 : return NULL;
2704 : }
2705 0 : crm_trace("Unpacking " PCMK__XE_LRM_RESOURCE " for %s on %s",
2706 : rsc_id, pcmk__node_name(node));
2707 :
2708 : /* Build a list of individual PCMK__XE_LRM_RSC_OP entries, so we can sort
2709 : * them
2710 : */
2711 0 : for (rsc_op = pcmk__xe_first_child(lrm_resource, PCMK__XE_LRM_RSC_OP, NULL,
2712 : NULL);
2713 0 : rsc_op != NULL; rsc_op = pcmk__xe_next_same(rsc_op)) {
2714 :
2715 0 : op_list = g_list_prepend(op_list, rsc_op);
2716 : }
2717 :
2718 0 : if (!pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
2719 0 : if (op_list == NULL) {
2720 : // If there are no operations, there is nothing to do
2721 0 : return NULL;
2722 : }
2723 : }
2724 :
2725 : /* find the resource */
2726 0 : rsc = unpack_find_resource(scheduler, node, rsc_id);
2727 0 : if (rsc == NULL) {
2728 0 : if (op_list == NULL) {
2729 : // If there are no operations, there is nothing to do
2730 0 : return NULL;
2731 : } else {
2732 0 : rsc = process_orphan_resource(lrm_resource, node, scheduler);
2733 : }
2734 : }
2735 0 : CRM_ASSERT(rsc != NULL);
2736 :
2737 : // Check whether the resource is "shutdown-locked" to this node
2738 0 : if (pcmk_is_set(scheduler->flags, pcmk_sched_shutdown_lock)) {
2739 0 : unpack_shutdown_lock(lrm_resource, rsc, node, scheduler);
2740 : }
2741 :
2742 : /* process operations */
2743 0 : saved_role = rsc->role;
2744 0 : rsc->role = pcmk_role_unknown;
2745 0 : sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
2746 :
2747 0 : for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
2748 0 : xmlNode *rsc_op = (xmlNode *) gIter->data;
2749 :
2750 0 : unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail);
2751 : }
2752 :
2753 : /* create active recurring operations as optional */
2754 0 : calculate_active_ops(sorted_op_list, &start_index, &stop_index);
2755 0 : process_recurring(node, rsc, start_index, stop_index, sorted_op_list,
2756 : scheduler);
2757 :
2758 : /* no need to free the contents */
2759 0 : g_list_free(sorted_op_list);
2760 :
2761 0 : process_rsc_state(rsc, node, on_fail);
2762 :
2763 0 : if (get_target_role(rsc, &req_role)) {
2764 0 : if ((rsc->next_role == pcmk_role_unknown)
2765 0 : || (req_role < rsc->next_role)) {
2766 :
2767 0 : pe__set_next_role(rsc, req_role, PCMK_META_TARGET_ROLE);
2768 :
2769 0 : } else if (req_role > rsc->next_role) {
2770 0 : pcmk__rsc_info(rsc,
2771 : "%s: Not overwriting calculated next role %s"
2772 : " with requested next role %s",
2773 : rsc->id, pcmk_role_text(rsc->next_role),
2774 : pcmk_role_text(req_role));
2775 : }
2776 : }
2777 :
2778 0 : if (saved_role > rsc->role) {
2779 0 : rsc->role = saved_role;
2780 : }
2781 :
2782 0 : return rsc;
2783 : }
2784 :
2785 : static void
2786 0 : handle_orphaned_container_fillers(const xmlNode *lrm_rsc_list,
2787 : pcmk_scheduler_t *scheduler)
2788 : {
2789 0 : for (const xmlNode *rsc_entry = pcmk__xe_first_child(lrm_rsc_list, NULL,
2790 : NULL, NULL);
2791 0 : rsc_entry != NULL; rsc_entry = pcmk__xe_next(rsc_entry)) {
2792 :
2793 : pcmk_resource_t *rsc;
2794 : pcmk_resource_t *container;
2795 : const char *rsc_id;
2796 : const char *container_id;
2797 :
2798 0 : if (!pcmk__xe_is(rsc_entry, PCMK__XE_LRM_RESOURCE)) {
2799 0 : continue;
2800 : }
2801 :
2802 0 : container_id = crm_element_value(rsc_entry, PCMK__META_CONTAINER);
2803 0 : rsc_id = crm_element_value(rsc_entry, PCMK_XA_ID);
2804 0 : if (container_id == NULL || rsc_id == NULL) {
2805 0 : continue;
2806 : }
2807 :
2808 0 : container = pe_find_resource(scheduler->resources, container_id);
2809 0 : if (container == NULL) {
2810 0 : continue;
2811 : }
2812 :
2813 0 : rsc = pe_find_resource(scheduler->resources, rsc_id);
2814 0 : if ((rsc == NULL) || (rsc->container != NULL)
2815 0 : || !pcmk_is_set(rsc->flags, pcmk_rsc_removed_filler)) {
2816 0 : continue;
2817 : }
2818 :
2819 0 : pcmk__rsc_trace(rsc, "Mapped container of orphaned resource %s to %s",
2820 : rsc->id, container_id);
2821 0 : rsc->container = container;
2822 0 : container->fillers = g_list_append(container->fillers, rsc);
2823 : }
2824 0 : }
2825 :
2826 : /*!
2827 : * \internal
2828 : * \brief Unpack one node's lrm status section
2829 : *
2830 : * \param[in,out] node Node whose status is being unpacked
2831 : * \param[in] xml CIB node state XML
2832 : * \param[in,out] scheduler Scheduler data
2833 : */
2834 : static void
2835 0 : unpack_node_lrm(pcmk_node_t *node, const xmlNode *xml,
2836 : pcmk_scheduler_t *scheduler)
2837 : {
2838 0 : bool found_orphaned_container_filler = false;
2839 :
2840 : // Drill down to PCMK__XE_LRM_RESOURCES section
2841 0 : xml = pcmk__xe_first_child(xml, PCMK__XE_LRM, NULL, NULL);
2842 0 : if (xml == NULL) {
2843 0 : return;
2844 : }
2845 0 : xml = pcmk__xe_first_child(xml, PCMK__XE_LRM_RESOURCES, NULL, NULL);
2846 0 : if (xml == NULL) {
2847 0 : return;
2848 : }
2849 :
2850 : // Unpack each PCMK__XE_LRM_RESOURCE entry
2851 0 : for (const xmlNode *rsc_entry = pcmk__xe_first_child(xml,
2852 : PCMK__XE_LRM_RESOURCE,
2853 : NULL, NULL);
2854 0 : rsc_entry != NULL; rsc_entry = pcmk__xe_next_same(rsc_entry)) {
2855 :
2856 0 : pcmk_resource_t *rsc = unpack_lrm_resource(node, rsc_entry, scheduler);
2857 :
2858 0 : if ((rsc != NULL)
2859 0 : && pcmk_is_set(rsc->flags, pcmk_rsc_removed_filler)) {
2860 0 : found_orphaned_container_filler = true;
2861 : }
2862 : }
2863 :
2864 : /* Now that all resource state has been unpacked for this node, map any
2865 : * orphaned container fillers to their container resource.
2866 : */
2867 0 : if (found_orphaned_container_filler) {
2868 0 : handle_orphaned_container_fillers(xml, scheduler);
2869 : }
2870 : }
2871 :
2872 : static void
2873 0 : set_active(pcmk_resource_t *rsc)
2874 : {
2875 0 : const pcmk_resource_t *top = pe__const_top_resource(rsc, false);
2876 :
2877 0 : if (top && pcmk_is_set(top->flags, pcmk_rsc_promotable)) {
2878 0 : rsc->role = pcmk_role_unpromoted;
2879 : } else {
2880 0 : rsc->role = pcmk_role_started;
2881 : }
2882 0 : }
2883 :
2884 : static void
2885 0 : set_node_score(gpointer key, gpointer value, gpointer user_data)
2886 : {
2887 0 : pcmk_node_t *node = value;
2888 0 : int *score = user_data;
2889 :
2890 0 : node->weight = *score;
2891 0 : }
2892 :
// XPath to a node state element within the CIB status section
#define XPATH_NODE_STATE        "/" PCMK_XE_CIB "/" PCMK_XE_STATUS \
                                "/" PCMK__XE_NODE_STATE

// XPath from a node state element to its resource history elements
#define SUB_XPATH_LRM_RESOURCE  "/" PCMK__XE_LRM             \
                                "/" PCMK__XE_LRM_RESOURCES    \
                                "/" PCMK__XE_LRM_RESOURCE

// XPath from a resource history element to its operation history entries
#define SUB_XPATH_LRM_RSC_OP    "/" PCMK__XE_LRM_RSC_OP
2899 :
2900 : static xmlNode *
2901 0 : find_lrm_op(const char *resource, const char *op, const char *node, const char *source,
2902 : int target_rc, pcmk_scheduler_t *scheduler)
2903 : {
2904 0 : GString *xpath = NULL;
2905 0 : xmlNode *xml = NULL;
2906 :
2907 0 : CRM_CHECK((resource != NULL) && (op != NULL) && (node != NULL),
2908 : return NULL);
2909 :
2910 0 : xpath = g_string_sized_new(256);
2911 0 : pcmk__g_strcat(xpath,
2912 : XPATH_NODE_STATE "[@" PCMK_XA_UNAME "='", node, "']"
2913 : SUB_XPATH_LRM_RESOURCE "[@" PCMK_XA_ID "='", resource, "']"
2914 : SUB_XPATH_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='", op, "'",
2915 : NULL);
2916 :
2917 : /* Need to check against transition_magic too? */
2918 0 : if ((source != NULL) && (strcmp(op, PCMK_ACTION_MIGRATE_TO) == 0)) {
2919 0 : pcmk__g_strcat(xpath,
2920 : " and @" PCMK__META_MIGRATE_TARGET "='", source, "']",
2921 : NULL);
2922 :
2923 0 : } else if ((source != NULL)
2924 0 : && (strcmp(op, PCMK_ACTION_MIGRATE_FROM) == 0)) {
2925 0 : pcmk__g_strcat(xpath,
2926 : " and @" PCMK__META_MIGRATE_SOURCE "='", source, "']",
2927 : NULL);
2928 : } else {
2929 : g_string_append_c(xpath, ']');
2930 : }
2931 :
2932 0 : xml = get_xpath_object((const char *) xpath->str, scheduler->input,
2933 : LOG_DEBUG);
2934 0 : g_string_free(xpath, TRUE);
2935 :
2936 0 : if (xml && target_rc >= 0) {
2937 0 : int rc = PCMK_OCF_UNKNOWN_ERROR;
2938 0 : int status = PCMK_EXEC_ERROR;
2939 :
2940 0 : crm_element_value_int(xml, PCMK__XA_RC_CODE, &rc);
2941 0 : crm_element_value_int(xml, PCMK__XA_OP_STATUS, &status);
2942 0 : if ((rc != target_rc) || (status != PCMK_EXEC_DONE)) {
2943 0 : return NULL;
2944 : }
2945 : }
2946 0 : return xml;
2947 : }
2948 :
2949 : static xmlNode *
2950 0 : find_lrm_resource(const char *rsc_id, const char *node_name,
2951 : pcmk_scheduler_t *scheduler)
2952 : {
2953 0 : GString *xpath = NULL;
2954 0 : xmlNode *xml = NULL;
2955 :
2956 0 : CRM_CHECK((rsc_id != NULL) && (node_name != NULL), return NULL);
2957 :
2958 0 : xpath = g_string_sized_new(256);
2959 0 : pcmk__g_strcat(xpath,
2960 : XPATH_NODE_STATE "[@" PCMK_XA_UNAME "='", node_name, "']"
2961 : SUB_XPATH_LRM_RESOURCE "[@" PCMK_XA_ID "='", rsc_id, "']",
2962 : NULL);
2963 :
2964 0 : xml = get_xpath_object((const char *) xpath->str, scheduler->input,
2965 : LOG_DEBUG);
2966 :
2967 0 : g_string_free(xpath, TRUE);
2968 0 : return xml;
2969 : }
2970 :
2971 : /*!
2972 : * \internal
2973 : * \brief Check whether a resource has no completed action history on a node
2974 : *
2975 : * \param[in,out] rsc Resource to check
2976 : * \param[in] node_name Node to check
2977 : *
2978 : * \return true if \p rsc_id is unknown on \p node_name, otherwise false
2979 : */
2980 : static bool
2981 0 : unknown_on_node(pcmk_resource_t *rsc, const char *node_name)
2982 : {
2983 0 : bool result = false;
2984 : xmlXPathObjectPtr search;
2985 0 : char *xpath = NULL;
2986 :
2987 0 : xpath = crm_strdup_printf(XPATH_NODE_STATE "[@" PCMK_XA_UNAME "='%s']"
2988 : SUB_XPATH_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']"
2989 : SUB_XPATH_LRM_RSC_OP
2990 : "[@" PCMK__XA_RC_CODE "!='%d']",
2991 : node_name, rsc->id, PCMK_OCF_UNKNOWN);
2992 :
2993 0 : search = xpath_search(rsc->cluster->input, xpath);
2994 0 : result = (numXpathResults(search) == 0);
2995 0 : freeXpathObject(search);
2996 0 : free(xpath);
2997 0 : return result;
2998 : }
2999 :
3000 : /*!
3001 : * \brief Check whether a probe/monitor indicating the resource was not running
3002 : * on a node happened after some event
3003 : *
3004 : * \param[in] rsc_id Resource being checked
3005 : * \param[in] node_name Node being checked
3006 : * \param[in] xml_op Event that monitor is being compared to
3007 : * \param[in] same_node Whether the operations are on the same node
3008 : * \param[in,out] scheduler Scheduler data
3009 : *
3010 : * \return true if such a monitor happened after event, false otherwise
3011 : */
3012 : static bool
3013 0 : monitor_not_running_after(const char *rsc_id, const char *node_name,
3014 : const xmlNode *xml_op, bool same_node,
3015 : pcmk_scheduler_t *scheduler)
3016 : {
3017 : /* Any probe/monitor operation on the node indicating it was not running
3018 : * there
3019 : */
3020 0 : xmlNode *monitor = find_lrm_op(rsc_id, PCMK_ACTION_MONITOR, node_name,
3021 : NULL, PCMK_OCF_NOT_RUNNING, scheduler);
3022 :
3023 0 : return (monitor && pe__is_newer_op(monitor, xml_op, same_node) > 0);
3024 : }
3025 :
3026 : /*!
3027 : * \brief Check whether any non-monitor operation on a node happened after some
3028 : * event
3029 : *
3030 : * \param[in] rsc_id Resource being checked
3031 : * \param[in] node_name Node being checked
3032 : * \param[in] xml_op Event that non-monitor is being compared to
3033 : * \param[in] same_node Whether the operations are on the same node
3034 : * \param[in,out] scheduler Scheduler data
3035 : *
3036 : * \return true if such a operation happened after event, false otherwise
3037 : */
3038 : static bool
3039 0 : non_monitor_after(const char *rsc_id, const char *node_name,
3040 : const xmlNode *xml_op, bool same_node,
3041 : pcmk_scheduler_t *scheduler)
3042 : {
3043 0 : xmlNode *lrm_resource = NULL;
3044 :
3045 0 : lrm_resource = find_lrm_resource(rsc_id, node_name, scheduler);
3046 0 : if (lrm_resource == NULL) {
3047 0 : return false;
3048 : }
3049 :
3050 0 : for (xmlNode *op = pcmk__xe_first_child(lrm_resource, PCMK__XE_LRM_RSC_OP,
3051 : NULL, NULL);
3052 0 : op != NULL; op = pcmk__xe_next_same(op)) {
3053 :
3054 0 : const char * task = NULL;
3055 :
3056 0 : if (op == xml_op) {
3057 0 : continue;
3058 : }
3059 :
3060 0 : task = crm_element_value(op, PCMK_XA_OPERATION);
3061 :
3062 0 : if (pcmk__str_any_of(task, PCMK_ACTION_START, PCMK_ACTION_STOP,
3063 : PCMK_ACTION_MIGRATE_TO, PCMK_ACTION_MIGRATE_FROM,
3064 : NULL)
3065 0 : && pe__is_newer_op(op, xml_op, same_node) > 0) {
3066 0 : return true;
3067 : }
3068 : }
3069 :
3070 0 : return false;
3071 : }
3072 :
3073 : /*!
3074 : * \brief Check whether the resource has newer state on a node after a migration
3075 : * attempt
3076 : *
3077 : * \param[in] rsc_id Resource being checked
3078 : * \param[in] node_name Node being checked
3079 : * \param[in] migrate_to Any migrate_to event that is being compared to
3080 : * \param[in] migrate_from Any migrate_from event that is being compared to
3081 : * \param[in,out] scheduler Scheduler data
3082 : *
3083 : * \return true if such a operation happened after event, false otherwise
3084 : */
3085 : static bool
3086 0 : newer_state_after_migrate(const char *rsc_id, const char *node_name,
3087 : const xmlNode *migrate_to,
3088 : const xmlNode *migrate_from,
3089 : pcmk_scheduler_t *scheduler)
3090 : {
3091 0 : const xmlNode *xml_op = migrate_to;
3092 0 : const char *source = NULL;
3093 0 : const char *target = NULL;
3094 0 : bool same_node = false;
3095 :
3096 0 : if (migrate_from) {
3097 0 : xml_op = migrate_from;
3098 : }
3099 :
3100 0 : source = crm_element_value(xml_op, PCMK__META_MIGRATE_SOURCE);
3101 0 : target = crm_element_value(xml_op, PCMK__META_MIGRATE_TARGET);
3102 :
3103 : /* It's preferred to compare to the migrate event on the same node if
3104 : * existing, since call ids are more reliable.
3105 : */
3106 0 : if (pcmk__str_eq(node_name, target, pcmk__str_casei)) {
3107 0 : if (migrate_from) {
3108 0 : xml_op = migrate_from;
3109 0 : same_node = true;
3110 :
3111 : } else {
3112 0 : xml_op = migrate_to;
3113 : }
3114 :
3115 0 : } else if (pcmk__str_eq(node_name, source, pcmk__str_casei)) {
3116 0 : if (migrate_to) {
3117 0 : xml_op = migrate_to;
3118 0 : same_node = true;
3119 :
3120 : } else {
3121 0 : xml_op = migrate_from;
3122 : }
3123 : }
3124 :
3125 : /* If there's any newer non-monitor operation on the node, or any newer
3126 : * probe/monitor operation on the node indicating it was not running there,
3127 : * the migration events potentially no longer matter for the node.
3128 : */
3129 0 : return non_monitor_after(rsc_id, node_name, xml_op, same_node, scheduler)
3130 0 : || monitor_not_running_after(rsc_id, node_name, xml_op, same_node,
3131 : scheduler);
3132 : }
3133 :
3134 : /*!
3135 : * \internal
3136 : * \brief Parse migration source and target node names from history entry
3137 : *
3138 : * \param[in] entry Resource history entry for a migration action
3139 : * \param[in] source_node If not NULL, source must match this node
3140 : * \param[in] target_node If not NULL, target must match this node
3141 : * \param[out] source_name Where to store migration source node name
3142 : * \param[out] target_name Where to store migration target node name
3143 : *
3144 : * \return Standard Pacemaker return code
3145 : */
3146 : static int
3147 0 : get_migration_node_names(const xmlNode *entry, const pcmk_node_t *source_node,
3148 : const pcmk_node_t *target_node,
3149 : const char **source_name, const char **target_name)
3150 : {
3151 0 : *source_name = crm_element_value(entry, PCMK__META_MIGRATE_SOURCE);
3152 0 : *target_name = crm_element_value(entry, PCMK__META_MIGRATE_TARGET);
3153 0 : if ((*source_name == NULL) || (*target_name == NULL)) {
3154 0 : pcmk__config_err("Ignoring resource history entry %s without "
3155 : PCMK__META_MIGRATE_SOURCE " and "
3156 : PCMK__META_MIGRATE_TARGET, pcmk__xe_id(entry));
3157 0 : return pcmk_rc_unpack_error;
3158 : }
3159 :
3160 0 : if ((source_node != NULL)
3161 0 : && !pcmk__str_eq(*source_name, source_node->details->uname,
3162 : pcmk__str_casei|pcmk__str_null_matches)) {
3163 0 : pcmk__config_err("Ignoring resource history entry %s because "
3164 : PCMK__META_MIGRATE_SOURCE "='%s' does not match %s",
3165 : pcmk__xe_id(entry), *source_name,
3166 : pcmk__node_name(source_node));
3167 0 : return pcmk_rc_unpack_error;
3168 : }
3169 :
3170 0 : if ((target_node != NULL)
3171 0 : && !pcmk__str_eq(*target_name, target_node->details->uname,
3172 : pcmk__str_casei|pcmk__str_null_matches)) {
3173 0 : pcmk__config_err("Ignoring resource history entry %s because "
3174 : PCMK__META_MIGRATE_TARGET "='%s' does not match %s",
3175 : pcmk__xe_id(entry), *target_name,
3176 : pcmk__node_name(target_node));
3177 0 : return pcmk_rc_unpack_error;
3178 : }
3179 :
3180 0 : return pcmk_rc_ok;
3181 : }
3182 :
3183 : /*
3184 : * \internal
3185 : * \brief Add a migration source to a resource's list of dangling migrations
3186 : *
3187 : * If the migrate_to and migrate_from actions in a live migration both
3188 : * succeeded, but there is no stop on the source, the migration is considered
3189 : * "dangling." Add the source to the resource's dangling migration list, which
3190 : * will be used to schedule a stop on the source without affecting the target.
3191 : *
3192 : * \param[in,out] rsc Resource involved in migration
3193 : * \param[in] node Migration source
3194 : */
3195 : static void
3196 0 : add_dangling_migration(pcmk_resource_t *rsc, const pcmk_node_t *node)
3197 : {
3198 0 : pcmk__rsc_trace(rsc, "Dangling migration of %s requires stop on %s",
3199 : rsc->id, pcmk__node_name(node));
3200 0 : rsc->role = pcmk_role_stopped;
3201 0 : rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations,
3202 : (gpointer) node);
3203 0 : }
3204 :
/*!
 * \internal
 * \brief Update resource role etc. after a successful migrate_to action
 *
 * Depending on what the source and target histories show, this may leave the
 * resource untouched, record a dangling migration, mark the resource as
 * started on the source, add it as running on the target, record a partial
 * migration, or flag the resource as failed and not migratable.
 *
 * \param[in,out] history  Parsed action result history (for the migrate_to on
 *                         the migration source)
 */
static void
unpack_migrate_to_success(struct action_history *history)
{
    /* A complete migration sequence is:
     * 1. migrate_to on source node (which succeeded if we get to this function)
     * 2. migrate_from on target node
     * 3. stop on source node
     *
     * If no migrate_from has happened, the migration is considered to be
     * "partial". If the migrate_from succeeded but no stop has happened, the
     * migration is considered to be "dangling".
     *
     * If a successful migrate_to and stop have happened on the source node, we
     * still need to check for a partial migration, due to scenarios (easier to
     * produce with batch-limit=1) like:
     *
     * - A resource is migrating from node1 to node2, and a migrate_to is
     *   initiated for it on node1.
     *
     * - node2 goes into standby mode while the migrate_to is pending, which
     *   aborts the transition.
     *
     * - Upon completion of the migrate_to, a new transition schedules a stop
     *   on both nodes and a start on node1.
     *
     * - If the new transition is aborted for any reason while the resource is
     *   stopping on node1, the transition after that stop completes will see
     *   the migrate_to and stop on the source, but it's still a partial
     *   migration, and the resource must be stopped on node2 because it is
     *   potentially active there due to the migrate_to.
     *
     * We also need to take into account that either node's history may be
     * cleared at any point in the migration process.
     */

    /* These defaults apply when no migrate_from entry is found: the status
     * stays pending, so the dangling-migration check below (which requires
     * PCMK_EXEC_DONE) cannot match.
     */
    int from_rc = PCMK_OCF_OK;
    int from_status = PCMK_EXEC_PENDING;
    pcmk_node_t *target_node = NULL;
    xmlNode *migrate_from = NULL;
    const char *source = NULL;
    const char *target = NULL;
    bool source_newer_op = false;
    bool target_newer_state = false;
    bool active_on_target = false;

    // Get source and target node names from XML
    if (get_migration_node_names(history->xml, history->node, NULL, &source,
                                 &target) != pcmk_rc_ok) {
        return;
    }

    // Check for newer state on the source
    source_newer_op = non_monitor_after(history->rsc->id, source, history->xml,
                                        true, history->rsc->cluster);

    /* Check for a migrate_from action from this source on the target (the
     * negative target_rc means find_lrm_op() does not filter by result)
     */
    migrate_from = find_lrm_op(history->rsc->id, PCMK_ACTION_MIGRATE_FROM,
                               target, source, -1, history->rsc->cluster);
    if (migrate_from != NULL) {
        if (source_newer_op) {
            /* There's a newer non-monitor operation on the source and a
             * migrate_from on the target, so this migrate_to is irrelevant to
             * the resource's state.
             */
            return;
        }
        crm_element_value_int(migrate_from, PCMK__XA_RC_CODE, &from_rc);
        crm_element_value_int(migrate_from, PCMK__XA_OP_STATUS, &from_status);
    }

    /* If the resource has newer state on both the source and target after the
     * migration events, this migrate_to is irrelevant to the resource's state.
     */
    target_newer_state = newer_state_after_migrate(history->rsc->id, target,
                                                   history->xml, migrate_from,
                                                   history->rsc->cluster);
    if (source_newer_op && target_newer_state) {
        return;
    }

    /* Check for dangling migration (migrate_from succeeded but stop not done).
     * We know there's no stop because we already returned if the target has a
     * migrate_from and the source has any newer non-monitor operation.
     */
    if ((from_rc == PCMK_OCF_OK) && (from_status == PCMK_EXEC_DONE)) {
        add_dangling_migration(history->rsc, history->node);
        return;
    }

    /* Without newer state, this migrate_to implies the resource is active.
     * (Clones are not allowed to migrate, so role can't be promoted.)
     */
    history->rsc->role = pcmk_role_started;

    // The resource can be active on the target only if the target is online
    target_node = pcmk_find_node(history->rsc->cluster, target);
    active_on_target = !target_newer_state && (target_node != NULL)
                       && target_node->details->online;

    if (from_status != PCMK_EXEC_PENDING) { // migrate_from failed on target
        if (active_on_target) {
            /* NOTE(review): the final TRUE presumably tells
             * native_add_running() to treat the resource as failed -- confirm
             */
            native_add_running(history->rsc, target_node, history->rsc->cluster,
                               TRUE);
        } else {
            // Mark resource as failed, require recovery, and prevent migration
            pcmk__set_rsc_flags(history->rsc,
                                pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
            pcmk__clear_rsc_flags(history->rsc, pcmk_rsc_migratable);
        }
        return;
    }

    // The migrate_from is pending, complete but erased, or to be scheduled

    /* If there is no history at all for the resource on an online target, then
     * it was likely cleaned. Just return, and we'll schedule a probe. Once we
     * have the probe result, it will be reflected in target_newer_state.
     */
    if ((target_node != NULL) && target_node->details->online
        && unknown_on_node(history->rsc, target)) {
        return;
    }

    if (active_on_target) {
        pcmk_node_t *source_node = pcmk_find_node(history->rsc->cluster,
                                                  source);

        native_add_running(history->rsc, target_node, history->rsc->cluster,
                           FALSE);
        if ((source_node != NULL) && source_node->details->online) {
            /* This is a partial migration: the migrate_to completed
             * successfully on the source, but the migrate_from has not
             * completed. Remember the source and target; if the newly
             * chosen target remains the same when we schedule actions
             * later, we may continue with the migration.
             */
            history->rsc->partial_migration_target = target_node;
            history->rsc->partial_migration_source = source_node;
        }

    } else if (!source_newer_op) {
        // Mark resource as failed, require recovery, and prevent migration
        pcmk__set_rsc_flags(history->rsc,
                            pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
        pcmk__clear_rsc_flags(history->rsc, pcmk_rsc_migratable);
    }
}
3356 :
3357 : /*!
3358 : * \internal
3359 : * \brief Update resource role etc. after a failed migrate_to action
3360 : *
3361 : * \param[in,out] history Parsed action result history
3362 : */
3363 : static void
3364 0 : unpack_migrate_to_failure(struct action_history *history)
3365 : {
3366 0 : xmlNode *target_migrate_from = NULL;
3367 0 : const char *source = NULL;
3368 0 : const char *target = NULL;
3369 :
3370 : // Get source and target node names from XML
3371 0 : if (get_migration_node_names(history->xml, history->node, NULL, &source,
3372 : &target) != pcmk_rc_ok) {
3373 0 : return;
3374 : }
3375 :
3376 : /* If a migration failed, we have to assume the resource is active. Clones
3377 : * are not allowed to migrate, so role can't be promoted.
3378 : */
3379 0 : history->rsc->role = pcmk_role_started;
3380 :
3381 : // Check for migrate_from on the target
3382 0 : target_migrate_from = find_lrm_op(history->rsc->id,
3383 : PCMK_ACTION_MIGRATE_FROM, target, source,
3384 0 : PCMK_OCF_OK, history->rsc->cluster);
3385 :
3386 0 : if (/* If the resource state is unknown on the target, it will likely be
3387 : * probed there.
3388 : * Don't just consider it running there. We will get back here anyway in
3389 : * case the probe detects it's running there.
3390 : */
3391 0 : !unknown_on_node(history->rsc, target)
3392 : /* If the resource has newer state on the target after the migration
3393 : * events, this migrate_to no longer matters for the target.
3394 : */
3395 0 : && !newer_state_after_migrate(history->rsc->id, target, history->xml,
3396 : target_migrate_from,
3397 0 : history->rsc->cluster)) {
3398 : /* The resource has no newer state on the target, so assume it's still
3399 : * active there.
3400 : * (if it is up).
3401 : */
3402 0 : pcmk_node_t *target_node = pcmk_find_node(history->rsc->cluster,
3403 : target);
3404 :
3405 0 : if (target_node && target_node->details->online) {
3406 0 : native_add_running(history->rsc, target_node, history->rsc->cluster,
3407 : FALSE);
3408 : }
3409 :
3410 0 : } else if (!non_monitor_after(history->rsc->id, source, history->xml, true,
3411 0 : history->rsc->cluster)) {
3412 : /* We know the resource has newer state on the target, but this
3413 : * migrate_to still matters for the source as long as there's no newer
3414 : * non-monitor operation there.
3415 : */
3416 :
3417 : // Mark node as having dangling migration so we can force a stop later
3418 0 : history->rsc->dangling_migrations =
3419 0 : g_list_prepend(history->rsc->dangling_migrations,
3420 0 : (gpointer) history->node);
3421 : }
3422 : }
3423 :
3424 : /*!
3425 : * \internal
3426 : * \brief Update resource role etc. after a failed migrate_from action
3427 : *
3428 : * \param[in,out] history Parsed action result history
3429 : */
3430 : static void
3431 0 : unpack_migrate_from_failure(struct action_history *history)
3432 : {
3433 0 : xmlNode *source_migrate_to = NULL;
3434 0 : const char *source = NULL;
3435 0 : const char *target = NULL;
3436 :
3437 : // Get source and target node names from XML
3438 0 : if (get_migration_node_names(history->xml, NULL, history->node, &source,
3439 : &target) != pcmk_rc_ok) {
3440 0 : return;
3441 : }
3442 :
3443 : /* If a migration failed, we have to assume the resource is active. Clones
3444 : * are not allowed to migrate, so role can't be promoted.
3445 : */
3446 0 : history->rsc->role = pcmk_role_started;
3447 :
3448 : // Check for a migrate_to on the source
3449 0 : source_migrate_to = find_lrm_op(history->rsc->id, PCMK_ACTION_MIGRATE_TO,
3450 : source, target, PCMK_OCF_OK,
3451 0 : history->rsc->cluster);
3452 :
3453 0 : if (/* If the resource state is unknown on the source, it will likely be
3454 : * probed there.
3455 : * Don't just consider it running there. We will get back here anyway in
3456 : * case the probe detects it's running there.
3457 : */
3458 0 : !unknown_on_node(history->rsc, source)
3459 : /* If the resource has newer state on the source after the migration
3460 : * events, this migrate_from no longer matters for the source.
3461 : */
3462 0 : && !newer_state_after_migrate(history->rsc->id, source,
3463 0 : source_migrate_to, history->xml,
3464 0 : history->rsc->cluster)) {
3465 : /* The resource has no newer state on the source, so assume it's still
3466 : * active there (if it is up).
3467 : */
3468 0 : pcmk_node_t *source_node = pcmk_find_node(history->rsc->cluster,
3469 : source);
3470 :
3471 0 : if (source_node && source_node->details->online) {
3472 0 : native_add_running(history->rsc, source_node, history->rsc->cluster,
3473 : TRUE);
3474 : }
3475 : }
3476 : }
3477 :
3478 : /*!
3479 : * \internal
3480 : * \brief Add an action to cluster's list of failed actions
3481 : *
3482 : * \param[in,out] history Parsed action result history
3483 : */
3484 : static void
3485 0 : record_failed_op(struct action_history *history)
3486 : {
3487 0 : if (!(history->node->details->online)) {
3488 0 : return;
3489 : }
3490 :
3491 0 : for (const xmlNode *xIter = history->rsc->cluster->failed->children;
3492 0 : xIter != NULL; xIter = xIter->next) {
3493 :
3494 0 : const char *key = pcmk__xe_history_key(xIter);
3495 0 : const char *uname = crm_element_value(xIter, PCMK_XA_UNAME);
3496 :
3497 0 : if (pcmk__str_eq(history->key, key, pcmk__str_none)
3498 0 : && pcmk__str_eq(uname, history->node->details->uname,
3499 : pcmk__str_casei)) {
3500 0 : crm_trace("Skipping duplicate entry %s on %s",
3501 : history->key, pcmk__node_name(history->node));
3502 0 : return;
3503 : }
3504 : }
3505 :
3506 0 : crm_trace("Adding entry for %s on %s to failed action list",
3507 : history->key, pcmk__node_name(history->node));
3508 0 : crm_xml_add(history->xml, PCMK_XA_UNAME, history->node->details->uname);
3509 0 : crm_xml_add(history->xml, PCMK__XA_RSC_ID, history->rsc->id);
3510 0 : pcmk__xml_copy(history->rsc->cluster->failed, history->xml);
3511 : }
3512 :
3513 : static char *
3514 0 : last_change_str(const xmlNode *xml_op)
3515 : {
3516 : time_t when;
3517 0 : char *result = NULL;
3518 :
3519 0 : if (crm_element_value_epoch(xml_op, PCMK_XA_LAST_RC_CHANGE,
3520 : &when) == pcmk_ok) {
3521 0 : char *when_s = pcmk__epoch2str(&when, 0);
3522 0 : const char *p = strchr(when_s, ' ');
3523 :
3524 : // Skip day of week to make message shorter
3525 0 : if ((p != NULL) && (*(++p) != '\0')) {
3526 0 : result = pcmk__str_copy(p);
3527 : }
3528 0 : free(when_s);
3529 : }
3530 :
3531 0 : if (result == NULL) {
3532 0 : result = pcmk__str_copy("unknown_time");
3533 : }
3534 :
3535 0 : return result;
3536 : }
3537 :
3538 : /*!
3539 : * \internal
3540 : * \brief Compare two on-fail values
3541 : *
3542 : * \param[in] first One on-fail value to compare
3543 : * \param[in] second The other on-fail value to compare
3544 : *
3545 : * \return A negative number if second is more severe than first, zero if they
3546 : * are equal, or a positive number if first is more severe than second.
3547 : * \note This is only needed until the action_fail_response values can be
3548 : * renumbered at the next API compatibility break.
3549 : */
3550 : static int
3551 0 : cmp_on_fail(enum action_fail_response first, enum action_fail_response second)
3552 : {
3553 0 : switch (first) {
3554 0 : case pcmk_on_fail_demote:
3555 0 : switch (second) {
3556 0 : case pcmk_on_fail_ignore:
3557 0 : return 1;
3558 0 : case pcmk_on_fail_demote:
3559 0 : return 0;
3560 0 : default:
3561 0 : return -1;
3562 : }
3563 : break;
3564 :
3565 0 : case pcmk_on_fail_reset_remote:
3566 0 : switch (second) {
3567 0 : case pcmk_on_fail_ignore:
3568 : case pcmk_on_fail_demote:
3569 : case pcmk_on_fail_restart:
3570 0 : return 1;
3571 0 : case pcmk_on_fail_reset_remote:
3572 0 : return 0;
3573 0 : default:
3574 0 : return -1;
3575 : }
3576 : break;
3577 :
3578 0 : case pcmk_on_fail_restart_container:
3579 0 : switch (second) {
3580 0 : case pcmk_on_fail_ignore:
3581 : case pcmk_on_fail_demote:
3582 : case pcmk_on_fail_restart:
3583 : case pcmk_on_fail_reset_remote:
3584 0 : return 1;
3585 0 : case pcmk_on_fail_restart_container:
3586 0 : return 0;
3587 0 : default:
3588 0 : return -1;
3589 : }
3590 : break;
3591 :
3592 0 : default:
3593 0 : break;
3594 : }
3595 0 : switch (second) {
3596 0 : case pcmk_on_fail_demote:
3597 0 : return (first == pcmk_on_fail_ignore)? -1 : 1;
3598 :
3599 0 : case pcmk_on_fail_reset_remote:
3600 0 : switch (first) {
3601 0 : case pcmk_on_fail_ignore:
3602 : case pcmk_on_fail_demote:
3603 : case pcmk_on_fail_restart:
3604 0 : return -1;
3605 0 : default:
3606 0 : return 1;
3607 : }
3608 : break;
3609 :
3610 0 : case pcmk_on_fail_restart_container:
3611 0 : switch (first) {
3612 0 : case pcmk_on_fail_ignore:
3613 : case pcmk_on_fail_demote:
3614 : case pcmk_on_fail_restart:
3615 : case pcmk_on_fail_reset_remote:
3616 0 : return -1;
3617 0 : default:
3618 0 : return 1;
3619 : }
3620 : break;
3621 :
3622 0 : default:
3623 0 : break;
3624 : }
3625 0 : return first - second;
3626 : }
3627 :
3628 : /*!
3629 : * \internal
3630 : * \brief Ban a resource (or its clone if an anonymous instance) from all nodes
3631 : *
3632 : * \param[in,out] rsc Resource to ban
3633 : */
3634 : static void
3635 0 : ban_from_all_nodes(pcmk_resource_t *rsc)
3636 : {
3637 0 : int score = -PCMK_SCORE_INFINITY;
3638 0 : pcmk_resource_t *fail_rsc = rsc;
3639 :
3640 0 : if (fail_rsc->parent != NULL) {
3641 0 : pcmk_resource_t *parent = uber_parent(fail_rsc);
3642 :
3643 0 : if (pcmk__is_anonymous_clone(parent)) {
3644 : /* For anonymous clones, if an operation with
3645 : * PCMK_META_ON_FAIL=PCMK_VALUE_STOP fails for any instance, the
3646 : * entire clone must stop.
3647 : */
3648 0 : fail_rsc = parent;
3649 : }
3650 : }
3651 :
3652 : // Ban the resource from all nodes
3653 0 : crm_notice("%s will not be started under current conditions", fail_rsc->id);
3654 0 : if (fail_rsc->allowed_nodes != NULL) {
3655 0 : g_hash_table_destroy(fail_rsc->allowed_nodes);
3656 : }
3657 0 : fail_rsc->allowed_nodes = pe__node_list2table(rsc->cluster->nodes);
3658 0 : g_hash_table_foreach(fail_rsc->allowed_nodes, set_node_score, &score);
3659 0 : }
3660 :
3661 : /*!
3662 : * \internal
3663 : * \brief Get configured failure handling and role after failure for an action
3664 : *
3665 : * \param[in,out] history Unpacked action history entry
3666 : * \param[out] on_fail Where to set configured failure handling
3667 : * \param[out] fail_role Where to set to role after failure
3668 : */
3669 : static void
3670 0 : unpack_failure_handling(struct action_history *history,
3671 : enum action_fail_response *on_fail,
3672 : enum rsc_role_e *fail_role)
3673 : {
3674 0 : xmlNode *config = pcmk__find_action_config(history->rsc, history->task,
3675 : history->interval_ms, true);
3676 :
3677 0 : GHashTable *meta = pcmk__unpack_action_meta(history->rsc, history->node,
3678 : history->task,
3679 : history->interval_ms, config);
3680 :
3681 0 : const char *on_fail_str = g_hash_table_lookup(meta, PCMK_META_ON_FAIL);
3682 :
3683 0 : *on_fail = pcmk__parse_on_fail(history->rsc, history->task,
3684 : history->interval_ms, on_fail_str);
3685 0 : *fail_role = pcmk__role_after_failure(history->rsc, history->task, *on_fail,
3686 : meta);
3687 0 : g_hash_table_destroy(meta);
3688 0 : }
3689 :
3690 : /*!
3691 : * \internal
3692 : * \brief Update resource role, failure handling, etc., after a failed action
3693 : *
3694 : * \param[in,out] history Parsed action result history
3695 : * \param[in] config_on_fail Action failure handling from configuration
3696 : * \param[in] fail_role Resource's role after failure of this action
3697 : * \param[out] last_failure This will be set to the history XML
3698 : * \param[in,out] on_fail Actual handling of action result
3699 : */
3700 : static void
3701 0 : unpack_rsc_op_failure(struct action_history *history,
3702 : enum action_fail_response config_on_fail,
3703 : enum rsc_role_e fail_role, xmlNode **last_failure,
3704 : enum action_fail_response *on_fail)
3705 : {
3706 0 : bool is_probe = false;
3707 0 : char *last_change_s = NULL;
3708 :
3709 0 : *last_failure = history->xml;
3710 :
3711 0 : is_probe = pcmk_xe_is_probe(history->xml);
3712 0 : last_change_s = last_change_str(history->xml);
3713 :
3714 0 : if (!pcmk_is_set(history->rsc->cluster->flags, pcmk_sched_symmetric_cluster)
3715 0 : && (history->exit_status == PCMK_OCF_NOT_INSTALLED)) {
3716 0 : crm_trace("Unexpected result (%s%s%s) was recorded for "
3717 : "%s of %s on %s at %s " CRM_XS " exit-status=%d id=%s",
3718 : services_ocf_exitcode_str(history->exit_status),
3719 : (pcmk__str_empty(history->exit_reason)? "" : ": "),
3720 : pcmk__s(history->exit_reason, ""),
3721 : (is_probe? "probe" : history->task), history->rsc->id,
3722 : pcmk__node_name(history->node), last_change_s,
3723 : history->exit_status, history->id);
3724 : } else {
3725 0 : pcmk__sched_warn("Unexpected result (%s%s%s) was recorded for %s of "
3726 : "%s on %s at %s " CRM_XS " exit-status=%d id=%s",
3727 : services_ocf_exitcode_str(history->exit_status),
3728 : (pcmk__str_empty(history->exit_reason)? "" : ": "),
3729 : pcmk__s(history->exit_reason, ""),
3730 : (is_probe? "probe" : history->task), history->rsc->id,
3731 : pcmk__node_name(history->node), last_change_s,
3732 : history->exit_status, history->id);
3733 :
3734 0 : if (is_probe && (history->exit_status != PCMK_OCF_OK)
3735 0 : && (history->exit_status != PCMK_OCF_NOT_RUNNING)
3736 0 : && (history->exit_status != PCMK_OCF_RUNNING_PROMOTED)) {
3737 :
3738 : /* A failed (not just unexpected) probe result could mean the user
3739 : * didn't know resources will be probed even where they can't run.
3740 : */
3741 0 : crm_notice("If it is not possible for %s to run on %s, see "
3742 : "the " PCMK_XA_RESOURCE_DISCOVERY " option for location "
3743 : "constraints",
3744 : history->rsc->id, pcmk__node_name(history->node));
3745 : }
3746 :
3747 0 : record_failed_op(history);
3748 : }
3749 :
3750 0 : free(last_change_s);
3751 :
3752 0 : if (cmp_on_fail(*on_fail, config_on_fail) < 0) {
3753 0 : pcmk__rsc_trace(history->rsc, "on-fail %s -> %s for %s",
3754 : pcmk_on_fail_text(*on_fail),
3755 : pcmk_on_fail_text(config_on_fail), history->key);
3756 0 : *on_fail = config_on_fail;
3757 : }
3758 :
3759 0 : if (strcmp(history->task, PCMK_ACTION_STOP) == 0) {
3760 0 : resource_location(history->rsc, history->node, -PCMK_SCORE_INFINITY,
3761 0 : "__stop_fail__", history->rsc->cluster);
3762 :
3763 0 : } else if (strcmp(history->task, PCMK_ACTION_MIGRATE_TO) == 0) {
3764 0 : unpack_migrate_to_failure(history);
3765 :
3766 0 : } else if (strcmp(history->task, PCMK_ACTION_MIGRATE_FROM) == 0) {
3767 0 : unpack_migrate_from_failure(history);
3768 :
3769 0 : } else if (strcmp(history->task, PCMK_ACTION_PROMOTE) == 0) {
3770 0 : history->rsc->role = pcmk_role_promoted;
3771 :
3772 0 : } else if (strcmp(history->task, PCMK_ACTION_DEMOTE) == 0) {
3773 0 : if (config_on_fail == pcmk_on_fail_block) {
3774 0 : history->rsc->role = pcmk_role_promoted;
3775 0 : pe__set_next_role(history->rsc, pcmk_role_stopped,
3776 : "demote with " PCMK_META_ON_FAIL "=block");
3777 :
3778 0 : } else if (history->exit_status == PCMK_OCF_NOT_RUNNING) {
3779 0 : history->rsc->role = pcmk_role_stopped;
3780 :
3781 : } else {
3782 : /* Staying in the promoted role would put the scheduler and
3783 : * controller into a loop. Setting the role to unpromoted is not
3784 : * dangerous because the resource will be stopped as part of
3785 : * recovery, and any promotion will be ordered after that stop.
3786 : */
3787 0 : history->rsc->role = pcmk_role_unpromoted;
3788 : }
3789 : }
3790 :
3791 0 : if (is_probe && (history->exit_status == PCMK_OCF_NOT_INSTALLED)) {
3792 : /* leave stopped */
3793 0 : pcmk__rsc_trace(history->rsc, "Leaving %s stopped", history->rsc->id);
3794 0 : history->rsc->role = pcmk_role_stopped;
3795 :
3796 0 : } else if (history->rsc->role < pcmk_role_started) {
3797 0 : pcmk__rsc_trace(history->rsc, "Setting %s active", history->rsc->id);
3798 0 : set_active(history->rsc);
3799 : }
3800 :
3801 0 : pcmk__rsc_trace(history->rsc,
3802 : "Resource %s: role=%s unclean=%s on_fail=%s fail_role=%s",
3803 : history->rsc->id, pcmk_role_text(history->rsc->role),
3804 : pcmk__btoa(history->node->details->unclean),
3805 : pcmk_on_fail_text(config_on_fail),
3806 : pcmk_role_text(fail_role));
3807 :
3808 0 : if ((fail_role != pcmk_role_started)
3809 0 : && (history->rsc->next_role < fail_role)) {
3810 0 : pe__set_next_role(history->rsc, fail_role, "failure");
3811 : }
3812 :
3813 0 : if (fail_role == pcmk_role_stopped) {
3814 0 : ban_from_all_nodes(history->rsc);
3815 : }
3816 0 : }
3817 :
3818 : /*!
3819 : * \internal
3820 : * \brief Block a resource with a failed action if it cannot be recovered
3821 : *
3822 : * If resource action is a failed stop and fencing is not possible, mark the
3823 : * resource as unmanaged and blocked, since recovery cannot be done.
3824 : *
3825 : * \param[in,out] history Parsed action history entry
3826 : */
3827 : static void
3828 0 : block_if_unrecoverable(struct action_history *history)
3829 : {
3830 0 : char *last_change_s = NULL;
3831 :
3832 0 : if (strcmp(history->task, PCMK_ACTION_STOP) != 0) {
3833 0 : return; // All actions besides stop are always recoverable
3834 : }
3835 0 : if (pe_can_fence(history->node->details->data_set, history->node)) {
3836 0 : return; // Failed stops are recoverable via fencing
3837 : }
3838 :
3839 0 : last_change_s = last_change_str(history->xml);
3840 0 : pcmk__sched_err("No further recovery can be attempted for %s "
3841 : "because %s on %s failed (%s%s%s) at %s "
3842 : CRM_XS " rc=%d id=%s",
3843 : history->rsc->id, history->task,
3844 : pcmk__node_name(history->node),
3845 : services_ocf_exitcode_str(history->exit_status),
3846 : (pcmk__str_empty(history->exit_reason)? "" : ": "),
3847 : pcmk__s(history->exit_reason, ""),
3848 : last_change_s, history->exit_status, history->id);
3849 :
3850 0 : free(last_change_s);
3851 :
3852 0 : pcmk__clear_rsc_flags(history->rsc, pcmk_rsc_managed);
3853 0 : pcmk__set_rsc_flags(history->rsc, pcmk_rsc_blocked);
3854 : }
3855 :
3856 : /*!
3857 : * \internal
3858 : * \brief Update action history's execution status and why
3859 : *
3860 : * \param[in,out] history Parsed action history entry
3861 : * \param[out] why Where to store reason for update
3862 : * \param[in] value New value
3863 : * \param[in] reason Description of why value was changed
3864 : */
3865 : static inline void
3866 0 : remap_because(struct action_history *history, const char **why, int value,
3867 : const char *reason)
3868 : {
3869 0 : if (history->execution_status != value) {
3870 0 : history->execution_status = value;
3871 0 : *why = reason;
3872 : }
3873 0 : }
3874 :
/*!
 * \internal
 * \brief Remap informational monitor results and operation status
 *
 * For the monitor results, certain OCF codes are for providing extended
 * information to the user about services that aren't yet failed but not
 * entirely healthy either. These must be treated as the "normal" result by
 * Pacemaker.
 *
 * For operation status, the action result can be used to determine an
 * appropriate status for the purposes of responding to the action. The status
 * provided by the executor is not directly usable since the executor does not
 * know what was expected.
 *
 * \param[in,out] history  Parsed action history entry (its exit status and
 *                         execution status may be overwritten)
 * \param[in,out] on_fail  What should be done about the result
 * \param[in]     expired  Whether result is expired
 *
 * \note If a degraded result is remapped, the result is not expired, and the
 *       node is either not shutting down or still online, the operation will
 *       be passed to record_failed_op() to highlight it for the user.
 *
 * \note This may update the resource's current and next role.
 */
static void
remap_operation(struct action_history *history,
                enum action_fail_response *on_fail, bool expired)
{
    bool is_probe = false;

    // Remember the values as received, for the trace message at the end
    int orig_exit_status = history->exit_status;
    int orig_exec_status = history->execution_status;

    const char *why = NULL;             // Reason for most recent remapping
    const char *task = history->task;   // Task name to use in log messages

    // Remap degraded results to their successful counterparts
    history->exit_status = pcmk__effective_rc(history->exit_status);
    if (history->exit_status != orig_exit_status) {
        why = "degraded result";
        if (!expired && (!history->node->details->shutdown
                         || history->node->details->online)) {
            record_failed_op(history);
        }
    }

    /* For resources that are not bundled, a probe failure that should be
     * masked is treated as if the probe had completed with "not running"
     */
    if (!pcmk__is_bundled(history->rsc)
        && pcmk_xe_mask_probe_failure(history->xml)
        && ((history->execution_status != PCMK_EXEC_DONE)
            || (history->exit_status != PCMK_OCF_NOT_RUNNING))) {
        history->execution_status = PCMK_EXEC_DONE;
        history->exit_status = PCMK_OCF_NOT_RUNNING;
        why = "equivalent probe result";
    }

    /* If the executor reported an execution status of anything but done or
     * error, consider that final. But for done or error, we know better whether
     * it should be treated as a failure or not, because we know the expected
     * result.
     */
    switch (history->execution_status) {
        case PCMK_EXEC_DONE:
        case PCMK_EXEC_ERROR:
            break;

        // These should be treated as node-fatal
        case PCMK_EXEC_NO_FENCE_DEVICE:
        case PCMK_EXEC_NO_SECRETS:
            remap_because(history, &why, PCMK_EXEC_ERROR_HARD,
                          "node-fatal error");
            goto remap_done;

        default:
            goto remap_done;
    }

    is_probe = pcmk_xe_is_probe(history->xml);
    if (is_probe) {
        task = "probe";
    }

    // First pass: compare the actual exit status against the expected one
    if (history->expected_exit_status < 0) {
        /* Pre-1.0 Pacemaker versions, and Pacemaker 1.1.6 or earlier with
         * Heartbeat 2.0.7 or earlier as the cluster layer, did not include the
         * expected exit status in the transition key, which (along with the
         * similar case of a corrupted transition key in the CIB) will be
         * reported to this function as -1. Pacemaker 2.0+ does not support
         * rolling upgrades from those versions or processing of saved CIB files
         * from those versions, so we do not need to care much about this case.
         */
        remap_because(history, &why, PCMK_EXEC_ERROR,
                      "obsolete history format");
        pcmk__config_warn("Expected result not found for %s on %s "
                          "(corrupt or obsolete CIB?)",
                          history->key, pcmk__node_name(history->node));

    } else if (history->exit_status == history->expected_exit_status) {
        remap_because(history, &why, PCMK_EXEC_DONE, "expected result");

    } else {
        remap_because(history, &why, PCMK_EXEC_ERROR, "unexpected result");
        pcmk__rsc_debug(history->rsc,
                        "%s on %s: expected %d (%s), got %d (%s%s%s)",
                        history->key, pcmk__node_name(history->node),
                        history->expected_exit_status,
                        services_ocf_exitcode_str(history->expected_exit_status),
                        history->exit_status,
                        services_ocf_exitcode_str(history->exit_status),
                        (pcmk__str_empty(history->exit_reason)? "" : ": "),
                        pcmk__s(history->exit_reason, ""));
    }

    /* Second pass: refine the execution status (and possibly the resource's
     * role) according to the particular exit status
     */
    switch (history->exit_status) {
        case PCMK_OCF_OK:
            // A probe expecting "not running" that finds the resource active
            if (is_probe
                && (history->expected_exit_status == PCMK_OCF_NOT_RUNNING)) {
                char *last_change_s = last_change_str(history->xml);

                remap_because(history, &why, PCMK_EXEC_DONE, "probe");
                pcmk__rsc_info(history->rsc,
                               "Probe found %s active on %s at %s",
                               history->rsc->id, pcmk__node_name(history->node),
                               last_change_s);
                free(last_change_s);
            }
            break;

        case PCMK_OCF_NOT_RUNNING:
            if (is_probe
                || (history->expected_exit_status == history->exit_status)
                || !pcmk_is_set(history->rsc->flags, pcmk_rsc_managed)) {

                /* For probes, recurring monitors for the Stopped role, and
                 * unmanaged resources, "not running" is not considered a
                 * failure.
                 */
                remap_because(history, &why, PCMK_EXEC_DONE, "exit status");
                history->rsc->role = pcmk_role_stopped;
                *on_fail = pcmk_on_fail_ignore;
                pe__set_next_role(history->rsc, pcmk_role_unknown,
                                  "not running");
            }
            break;

        case PCMK_OCF_RUNNING_PROMOTED:
            // A probe that unexpectedly finds the resource promoted
            if (is_probe
                && (history->exit_status != history->expected_exit_status)) {
                char *last_change_s = last_change_str(history->xml);

                remap_because(history, &why, PCMK_EXEC_DONE, "probe");
                pcmk__rsc_info(history->rsc,
                               "Probe found %s active and promoted on %s at %s",
                               history->rsc->id,
                               pcmk__node_name(history->node), last_change_s);
                free(last_change_s);
            }

            // An expired result sets the role only if it was the expected one
            if (!expired
                || (history->exit_status == history->expected_exit_status)) {
                history->rsc->role = pcmk_role_promoted;
            }
            break;

        case PCMK_OCF_FAILED_PROMOTED:
            if (!expired) {
                history->rsc->role = pcmk_role_promoted;
            }
            remap_because(history, &why, PCMK_EXEC_ERROR, "exit status");
            break;

        case PCMK_OCF_NOT_CONFIGURED:
            remap_because(history, &why, PCMK_EXEC_ERROR_FATAL, "exit status");
            break;

        case PCMK_OCF_UNIMPLEMENT_FEATURE:
            {
                // The interval is read from the history XML here
                guint interval_ms = 0;
                crm_element_value_ms(history->xml, PCMK_META_INTERVAL,
                                     &interval_ms);

                if (interval_ms == 0) {
                    // Zero interval: handled like the hard errors below
                    if (!expired) {
                        block_if_unrecoverable(history);
                    }
                    remap_because(history, &why, PCMK_EXEC_ERROR_HARD,
                                  "exit status");
                } else {
                    // Nonzero interval: remapped to "not supported" instead
                    remap_because(history, &why, PCMK_EXEC_NOT_SUPPORTED,
                                  "exit status");
                }
            }
            break;

        case PCMK_OCF_NOT_INSTALLED:
        case PCMK_OCF_INVALID_PARAM:
        case PCMK_OCF_INSUFFICIENT_PRIV:
            if (!expired) {
                block_if_unrecoverable(history);
            }
            remap_because(history, &why, PCMK_EXEC_ERROR_HARD, "exit status");
            break;

        default:
            // Any other exit status counts as a failure even if "done"
            if (history->execution_status == PCMK_EXEC_DONE) {
                char *last_change_s = last_change_str(history->xml);

                crm_info("Treating unknown exit status %d from %s of %s "
                         "on %s at %s as failure",
                         history->exit_status, task, history->rsc->id,
                         pcmk__node_name(history->node), last_change_s);
                remap_because(history, &why, PCMK_EXEC_ERROR,
                              "unknown exit status");
                free(last_change_s);
            }
            break;
    }

remap_done:
    // Trace the overall change if anything above supplied a reason
    if (why != NULL) {
        pcmk__rsc_trace(history->rsc,
                        "Remapped %s result from [%s: %s] to [%s: %s] "
                        "because of %s",
                        history->key, pcmk_exec_status_str(orig_exec_status),
                        crm_exit_str(orig_exit_status),
                        pcmk_exec_status_str(history->execution_status),
                        crm_exit_str(history->exit_status), why);
    }
}
4098 :
4099 : // return TRUE if start or monitor last failure but parameters changed
4100 : static bool
4101 0 : should_clear_for_param_change(const xmlNode *xml_op, const char *task,
4102 : pcmk_resource_t *rsc, pcmk_node_t *node)
4103 : {
4104 0 : if (pcmk__str_any_of(task, PCMK_ACTION_START, PCMK_ACTION_MONITOR, NULL)) {
4105 0 : if (pe__bundle_needs_remote_name(rsc)) {
4106 : /* We haven't allocated resources yet, so we can't reliably
4107 : * substitute addr parameters for the REMOTE_CONTAINER_HACK.
4108 : * When that's needed, defer the check until later.
4109 : */
4110 0 : pe__add_param_check(xml_op, rsc, node, pcmk__check_last_failure,
4111 : rsc->cluster);
4112 :
4113 : } else {
4114 0 : pcmk__op_digest_t *digest_data = NULL;
4115 :
4116 0 : digest_data = rsc_action_digest_cmp(rsc, xml_op, node,
4117 : rsc->cluster);
4118 0 : switch (digest_data->rc) {
4119 0 : case pcmk__digest_unknown:
4120 0 : crm_trace("Resource %s history entry %s on %s"
4121 : " has no digest to compare",
4122 : rsc->id, pcmk__xe_history_key(xml_op),
4123 : node->details->id);
4124 0 : break;
4125 0 : case pcmk__digest_match:
4126 0 : break;
4127 0 : default:
4128 0 : return TRUE;
4129 : }
4130 : }
4131 : }
4132 0 : return FALSE;
4133 : }
4134 :
4135 : // Order action after fencing of remote node, given connection rsc
4136 : static void
4137 0 : order_after_remote_fencing(pcmk_action_t *action, pcmk_resource_t *remote_conn,
4138 : pcmk_scheduler_t *scheduler)
4139 : {
4140 0 : pcmk_node_t *remote_node = pcmk_find_node(scheduler, remote_conn->id);
4141 :
4142 0 : if (remote_node) {
4143 0 : pcmk_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL,
4144 : FALSE, scheduler);
4145 :
4146 0 : order_actions(fence, action, pcmk__ar_first_implies_then);
4147 : }
4148 0 : }
4149 :
4150 : static bool
4151 0 : should_ignore_failure_timeout(const pcmk_resource_t *rsc, const char *task,
4152 : guint interval_ms, bool is_last_failure)
4153 : {
4154 : /* Clearing failures of recurring monitors has special concerns. The
4155 : * executor reports only changes in the monitor result, so if the
4156 : * monitor is still active and still getting the same failure result,
4157 : * that will go undetected after the failure is cleared.
4158 : *
4159 : * Also, the operation history will have the time when the recurring
4160 : * monitor result changed to the given code, not the time when the
4161 : * result last happened.
4162 : *
4163 : * @TODO We probably should clear such failures only when the failure
4164 : * timeout has passed since the last occurrence of the failed result.
4165 : * However we don't record that information. We could maybe approximate
4166 : * that by clearing only if there is a more recent successful monitor or
4167 : * stop result, but we don't even have that information at this point
4168 : * since we are still unpacking the resource's operation history.
4169 : *
4170 : * This is especially important for remote connection resources with a
4171 : * reconnect interval, so in that case, we skip clearing failures
4172 : * if the remote node hasn't been fenced.
4173 : */
4174 0 : if (rsc->remote_reconnect_ms
4175 0 : && pcmk_is_set(rsc->cluster->flags, pcmk_sched_fencing_enabled)
4176 0 : && (interval_ms != 0)
4177 0 : && pcmk__str_eq(task, PCMK_ACTION_MONITOR, pcmk__str_casei)) {
4178 :
4179 0 : pcmk_node_t *remote_node = pcmk_find_node(rsc->cluster, rsc->id);
4180 :
4181 0 : if (remote_node && !remote_node->details->remote_was_fenced) {
4182 0 : if (is_last_failure) {
4183 0 : crm_info("Waiting to clear monitor failure for remote node %s"
4184 : " until fencing has occurred", rsc->id);
4185 : }
4186 0 : return TRUE;
4187 : }
4188 : }
4189 0 : return FALSE;
4190 : }
4191 :
4192 : /*!
4193 : * \internal
4194 : * \brief Check operation age and schedule failure clearing when appropriate
4195 : *
4196 : * This function has two distinct purposes. The first is to check whether an
4197 : * operation history entry is expired (i.e. the resource has a failure timeout,
4198 : * the entry is older than the timeout, and the resource either has no fail
4199 : * count or its fail count is entirely older than the timeout). The second is to
4200 : * schedule fail count clearing when appropriate (i.e. the operation is expired
4201 : * and either the resource has an expired fail count or the operation is a
4202 : * last_failure for a remote connection resource with a reconnect interval,
4203 : * or the operation is a last_failure for a start or monitor operation and the
4204 : * resource's parameters have changed since the operation).
4205 : *
4206 : * \param[in,out] history Parsed action result history
4207 : *
4208 : * \return true if operation history entry is expired, otherwise false
4209 : */
4210 : static bool
4211 0 : check_operation_expiry(struct action_history *history)
4212 : {
4213 0 : bool expired = false;
4214 0 : bool is_last_failure = pcmk__ends_with(history->id, "_last_failure_0");
4215 0 : time_t last_run = 0;
4216 0 : int unexpired_fail_count = 0;
4217 0 : const char *clear_reason = NULL;
4218 :
4219 0 : if (history->execution_status == PCMK_EXEC_NOT_INSTALLED) {
4220 0 : pcmk__rsc_trace(history->rsc,
4221 : "Resource history entry %s on %s is not expired: "
4222 : "Not Installed does not expire",
4223 : history->id, pcmk__node_name(history->node));
4224 0 : return false; // "Not installed" must always be cleared manually
4225 : }
4226 :
4227 0 : if ((history->rsc->failure_timeout > 0)
4228 0 : && (crm_element_value_epoch(history->xml, PCMK_XA_LAST_RC_CHANGE,
4229 : &last_run) == 0)) {
4230 :
4231 : /* Resource has a PCMK_META_FAILURE_TIMEOUT and history entry has a
4232 : * timestamp
4233 : */
4234 :
4235 0 : time_t now = get_effective_time(history->rsc->cluster);
4236 0 : time_t last_failure = 0;
4237 :
4238 : // Is this particular operation history older than the failure timeout?
4239 0 : if ((now >= (last_run + history->rsc->failure_timeout))
4240 0 : && !should_ignore_failure_timeout(history->rsc, history->task,
4241 : history->interval_ms,
4242 : is_last_failure)) {
4243 0 : expired = true;
4244 : }
4245 :
4246 : // Does the resource as a whole have an unexpired fail count?
4247 0 : unexpired_fail_count = pe_get_failcount(history->node, history->rsc,
4248 : &last_failure,
4249 : pcmk__fc_effective,
4250 0 : history->xml);
4251 :
4252 : // Update scheduler recheck time according to *last* failure
4253 0 : crm_trace("%s@%lld is %sexpired @%lld with unexpired_failures=%d timeout=%ds"
4254 : " last-failure@%lld",
4255 : history->id, (long long) last_run, (expired? "" : "not "),
4256 : (long long) now, unexpired_fail_count,
4257 : history->rsc->failure_timeout, (long long) last_failure);
4258 0 : last_failure += history->rsc->failure_timeout + 1;
4259 0 : if (unexpired_fail_count && (now < last_failure)) {
4260 0 : pe__update_recheck_time(last_failure, history->rsc->cluster,
4261 : "fail count expiration");
4262 : }
4263 : }
4264 :
4265 0 : if (expired) {
4266 0 : if (pe_get_failcount(history->node, history->rsc, NULL,
4267 0 : pcmk__fc_default, history->xml)) {
4268 : // There is a fail count ignoring timeout
4269 :
4270 0 : if (unexpired_fail_count == 0) {
4271 : // There is no fail count considering timeout
4272 0 : clear_reason = "it expired";
4273 :
4274 : } else {
4275 : /* This operation is old, but there is an unexpired fail count.
4276 : * In a properly functioning cluster, this should only be
4277 : * possible if this operation is not a failure (otherwise the
4278 : * fail count should be expired too), so this is really just a
4279 : * failsafe.
4280 : */
4281 0 : pcmk__rsc_trace(history->rsc,
4282 : "Resource history entry %s on %s is not "
4283 : "expired: Unexpired fail count",
4284 : history->id, pcmk__node_name(history->node));
4285 0 : expired = false;
4286 : }
4287 :
4288 0 : } else if (is_last_failure
4289 0 : && (history->rsc->remote_reconnect_ms != 0)) {
4290 : /* Clear any expired last failure when reconnect interval is set,
4291 : * even if there is no fail count.
4292 : */
4293 0 : clear_reason = "reconnect interval is set";
4294 : }
4295 : }
4296 :
4297 0 : if (!expired && is_last_failure
4298 0 : && should_clear_for_param_change(history->xml, history->task,
4299 : history->rsc, history->node)) {
4300 0 : clear_reason = "resource parameters have changed";
4301 : }
4302 :
4303 0 : if (clear_reason != NULL) {
4304 0 : pcmk_action_t *clear_op = NULL;
4305 :
4306 : // Schedule clearing of the fail count
4307 0 : clear_op = pe__clear_failcount(history->rsc, history->node,
4308 0 : clear_reason, history->rsc->cluster);
4309 :
4310 0 : if (pcmk_is_set(history->rsc->cluster->flags,
4311 : pcmk_sched_fencing_enabled)
4312 0 : && (history->rsc->remote_reconnect_ms != 0)) {
4313 : /* If we're clearing a remote connection due to a reconnect
4314 : * interval, we want to wait until any scheduled fencing
4315 : * completes.
4316 : *
4317 : * We could limit this to remote_node->details->unclean, but at
4318 : * this point, that's always true (it won't be reliable until
4319 : * after unpack_node_history() is done).
4320 : */
4321 0 : crm_info("Clearing %s failure will wait until any scheduled "
4322 : "fencing of %s completes",
4323 : history->task, history->rsc->id);
4324 0 : order_after_remote_fencing(clear_op, history->rsc,
4325 0 : history->rsc->cluster);
4326 : }
4327 : }
4328 :
4329 0 : if (expired && (history->interval_ms == 0)
4330 0 : && pcmk__str_eq(history->task, PCMK_ACTION_MONITOR, pcmk__str_none)) {
4331 0 : switch (history->exit_status) {
4332 0 : case PCMK_OCF_OK:
4333 : case PCMK_OCF_NOT_RUNNING:
4334 : case PCMK_OCF_RUNNING_PROMOTED:
4335 : case PCMK_OCF_DEGRADED:
4336 : case PCMK_OCF_DEGRADED_PROMOTED:
4337 : // Don't expire probes that return these values
4338 0 : pcmk__rsc_trace(history->rsc,
4339 : "Resource history entry %s on %s is not "
4340 : "expired: Probe result",
4341 : history->id, pcmk__node_name(history->node));
4342 0 : expired = false;
4343 0 : break;
4344 : }
4345 : }
4346 :
4347 0 : return expired;
4348 : }
4349 :
4350 : int
4351 0 : pe__target_rc_from_xml(const xmlNode *xml_op)
4352 : {
4353 0 : int target_rc = 0;
4354 0 : const char *key = crm_element_value(xml_op, PCMK__XA_TRANSITION_KEY);
4355 :
4356 0 : if (key == NULL) {
4357 0 : return -1;
4358 : }
4359 0 : decode_transition_key(key, NULL, NULL, NULL, &target_rc);
4360 0 : return target_rc;
4361 : }
4362 :
4363 : /*!
4364 : * \internal
4365 : * \brief Update a resource's state for an action result
4366 : *
4367 : * \param[in,out] history Parsed action history entry
4368 : * \param[in] exit_status Exit status to base new state on
4369 : * \param[in] last_failure Resource's last_failure entry, if known
4370 : * \param[in,out] on_fail Resource's current failure handling
4371 : */
4372 : static void
4373 0 : update_resource_state(struct action_history *history, int exit_status,
4374 : const xmlNode *last_failure,
4375 : enum action_fail_response *on_fail)
4376 : {
4377 0 : bool clear_past_failure = false;
4378 :
4379 0 : if ((exit_status == PCMK_OCF_NOT_INSTALLED)
4380 0 : || (!pcmk__is_bundled(history->rsc)
4381 0 : && pcmk_xe_mask_probe_failure(history->xml))) {
4382 0 : history->rsc->role = pcmk_role_stopped;
4383 :
4384 0 : } else if (exit_status == PCMK_OCF_NOT_RUNNING) {
4385 0 : clear_past_failure = true;
4386 :
4387 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_MONITOR,
4388 : pcmk__str_none)) {
4389 0 : if ((last_failure != NULL)
4390 0 : && pcmk__str_eq(history->key, pcmk__xe_history_key(last_failure),
4391 : pcmk__str_none)) {
4392 0 : clear_past_failure = true;
4393 : }
4394 0 : if (history->rsc->role < pcmk_role_started) {
4395 0 : set_active(history->rsc);
4396 : }
4397 :
4398 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_START, pcmk__str_none)) {
4399 0 : history->rsc->role = pcmk_role_started;
4400 0 : clear_past_failure = true;
4401 :
4402 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_STOP, pcmk__str_none)) {
4403 0 : history->rsc->role = pcmk_role_stopped;
4404 0 : clear_past_failure = true;
4405 :
4406 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_PROMOTE,
4407 : pcmk__str_none)) {
4408 0 : history->rsc->role = pcmk_role_promoted;
4409 0 : clear_past_failure = true;
4410 :
4411 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_DEMOTE,
4412 : pcmk__str_none)) {
4413 0 : if (*on_fail == pcmk_on_fail_demote) {
4414 : /* Demote clears an error only if
4415 : * PCMK_META_ON_FAIL=PCMK_VALUE_DEMOTE
4416 : */
4417 0 : clear_past_failure = true;
4418 : }
4419 0 : history->rsc->role = pcmk_role_unpromoted;
4420 :
4421 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_MIGRATE_FROM,
4422 : pcmk__str_none)) {
4423 0 : history->rsc->role = pcmk_role_started;
4424 0 : clear_past_failure = true;
4425 :
4426 0 : } else if (pcmk__str_eq(history->task, PCMK_ACTION_MIGRATE_TO,
4427 : pcmk__str_none)) {
4428 0 : unpack_migrate_to_success(history);
4429 :
4430 0 : } else if (history->rsc->role < pcmk_role_started) {
4431 0 : pcmk__rsc_trace(history->rsc, "%s active on %s",
4432 : history->rsc->id, pcmk__node_name(history->node));
4433 0 : set_active(history->rsc);
4434 : }
4435 :
4436 0 : if (!clear_past_failure) {
4437 0 : return;
4438 : }
4439 :
4440 0 : switch (*on_fail) {
4441 0 : case pcmk_on_fail_stop:
4442 : case pcmk_on_fail_ban:
4443 : case pcmk_on_fail_standby_node:
4444 : case pcmk_on_fail_fence_node:
4445 0 : pcmk__rsc_trace(history->rsc,
4446 : "%s (%s) is not cleared by a completed %s",
4447 : history->rsc->id, pcmk_on_fail_text(*on_fail),
4448 : history->task);
4449 0 : break;
4450 :
4451 0 : case pcmk_on_fail_block:
4452 : case pcmk_on_fail_ignore:
4453 : case pcmk_on_fail_demote:
4454 : case pcmk_on_fail_restart:
4455 : case pcmk_on_fail_restart_container:
4456 0 : *on_fail = pcmk_on_fail_ignore;
4457 0 : pe__set_next_role(history->rsc, pcmk_role_unknown,
4458 : "clear past failures");
4459 0 : break;
4460 :
4461 0 : case pcmk_on_fail_reset_remote:
4462 0 : if (history->rsc->remote_reconnect_ms == 0) {
4463 : /* With no reconnect interval, the connection is allowed to
4464 : * start again after the remote node is fenced and
4465 : * completely stopped. (With a reconnect interval, we wait
4466 : * for the failure to be cleared entirely before attempting
4467 : * to reconnect.)
4468 : */
4469 0 : *on_fail = pcmk_on_fail_ignore;
4470 0 : pe__set_next_role(history->rsc, pcmk_role_unknown,
4471 : "clear past failures and reset remote");
4472 : }
4473 0 : break;
4474 : }
4475 : }
4476 :
4477 : /*!
4478 : * \internal
4479 : * \brief Check whether a given history entry matters for resource state
4480 : *
4481 : * \param[in] history Parsed action history entry
4482 : *
4483 : * \return true if action can affect resource state, otherwise false
4484 : */
4485 : static inline bool
4486 0 : can_affect_state(struct action_history *history)
4487 : {
4488 : #if 0
4489 : /* @COMPAT It might be better to parse only actions we know we're interested
4490 : * in, rather than exclude a couple we don't. However that would be a
4491 : * behavioral change that should be done at a major or minor series release.
4492 : * Currently, unknown operations can affect whether a resource is considered
4493 : * active and/or failed.
4494 : */
4495 : return pcmk__str_any_of(history->task, PCMK_ACTION_MONITOR,
4496 : PCMK_ACTION_START, PCMK_ACTION_STOP,
4497 : PCMK_ACTION_PROMOTE, PCMK_ACTION_DEMOTE,
4498 : PCMK_ACTION_MIGRATE_TO, PCMK_ACTION_MIGRATE_FROM,
4499 : "asyncmon", NULL);
4500 : #else
4501 0 : return !pcmk__str_any_of(history->task, PCMK_ACTION_NOTIFY,
4502 0 : PCMK_ACTION_META_DATA, NULL);
4503 : #endif
4504 : }
4505 :
4506 : /*!
4507 : * \internal
4508 : * \brief Unpack execution/exit status and exit reason from a history entry
4509 : *
4510 : * \param[in,out] history Action history entry to unpack
4511 : *
4512 : * \return Standard Pacemaker return code
4513 : */
4514 : static int
4515 0 : unpack_action_result(struct action_history *history)
4516 : {
4517 0 : if ((crm_element_value_int(history->xml, PCMK__XA_OP_STATUS,
4518 : &(history->execution_status)) < 0)
4519 0 : || (history->execution_status < PCMK_EXEC_PENDING)
4520 0 : || (history->execution_status > PCMK_EXEC_MAX)
4521 0 : || (history->execution_status == PCMK_EXEC_CANCELLED)) {
4522 0 : pcmk__config_err("Ignoring resource history entry %s for %s on %s "
4523 : "with invalid " PCMK__XA_OP_STATUS " '%s'",
4524 : history->id, history->rsc->id,
4525 : pcmk__node_name(history->node),
4526 : pcmk__s(crm_element_value(history->xml,
4527 : PCMK__XA_OP_STATUS),
4528 : ""));
4529 0 : return pcmk_rc_unpack_error;
4530 : }
4531 0 : if ((crm_element_value_int(history->xml, PCMK__XA_RC_CODE,
4532 : &(history->exit_status)) < 0)
4533 0 : || (history->exit_status < 0) || (history->exit_status > CRM_EX_MAX)) {
4534 : #if 0
4535 : /* @COMPAT We should ignore malformed entries, but since that would
4536 : * change behavior, it should be done at a major or minor series
4537 : * release.
4538 : */
4539 : pcmk__config_err("Ignoring resource history entry %s for %s on %s "
4540 : "with invalid " PCMK__XA_RC_CODE " '%s'",
4541 : history->id, history->rsc->id,
4542 : pcmk__node_name(history->node),
4543 : pcmk__s(crm_element_value(history->xml,
4544 : PCMK__XA_RC_CODE),
4545 : ""));
4546 : return pcmk_rc_unpack_error;
4547 : #else
4548 0 : history->exit_status = CRM_EX_ERROR;
4549 : #endif
4550 : }
4551 0 : history->exit_reason = crm_element_value(history->xml, PCMK_XA_EXIT_REASON);
4552 0 : return pcmk_rc_ok;
4553 : }
4554 :
4555 : /*!
4556 : * \internal
4557 : * \brief Process an action history entry whose result expired
4558 : *
4559 : * \param[in,out] history Parsed action history entry
4560 : * \param[in] orig_exit_status Action exit status before remapping
4561 : *
4562 : * \return Standard Pacemaker return code (in particular, pcmk_rc_ok means the
4563 : * entry needs no further processing)
4564 : */
4565 : static int
4566 0 : process_expired_result(struct action_history *history, int orig_exit_status)
4567 : {
4568 0 : if (!pcmk__is_bundled(history->rsc)
4569 0 : && pcmk_xe_mask_probe_failure(history->xml)
4570 0 : && (orig_exit_status != history->expected_exit_status)) {
4571 :
4572 0 : if (history->rsc->role <= pcmk_role_stopped) {
4573 0 : history->rsc->role = pcmk_role_unknown;
4574 : }
4575 0 : crm_trace("Ignoring resource history entry %s for probe of %s on %s: "
4576 : "Masked failure expired",
4577 : history->id, history->rsc->id,
4578 : pcmk__node_name(history->node));
4579 0 : return pcmk_rc_ok;
4580 : }
4581 :
4582 0 : if (history->exit_status == history->expected_exit_status) {
4583 0 : return pcmk_rc_undetermined; // Only failures expire
4584 : }
4585 :
4586 0 : if (history->interval_ms == 0) {
4587 0 : crm_notice("Ignoring resource history entry %s for %s of %s on %s: "
4588 : "Expired failure",
4589 : history->id, history->task, history->rsc->id,
4590 : pcmk__node_name(history->node));
4591 0 : return pcmk_rc_ok;
4592 : }
4593 :
4594 0 : if (history->node->details->online && !history->node->details->unclean) {
4595 : /* Reschedule the recurring action. schedule_cancel() won't work at
4596 : * this stage, so as a hacky workaround, forcibly change the restart
4597 : * digest so pcmk__check_action_config() does what we want later.
4598 : *
4599 : * @TODO We should skip this if there is a newer successful monitor.
4600 : * Also, this causes rescheduling only if the history entry
4601 : * has a PCMK__XA_OP_DIGEST (which the expire-non-blocked-failure
4602 : * scheduler regression test doesn't, but that may not be a
4603 : * realistic scenario in production).
4604 : */
4605 0 : crm_notice("Rescheduling %s-interval %s of %s on %s "
4606 : "after failure expired",
4607 : pcmk__readable_interval(history->interval_ms), history->task,
4608 : history->rsc->id, pcmk__node_name(history->node));
4609 0 : crm_xml_add(history->xml, PCMK__XA_OP_RESTART_DIGEST,
4610 : "calculated-failure-timeout");
4611 0 : return pcmk_rc_ok;
4612 : }
4613 :
4614 0 : return pcmk_rc_undetermined;
4615 : }
4616 :
4617 : /*!
4618 : * \internal
4619 : * \brief Process a masked probe failure
4620 : *
4621 : * \param[in,out] history Parsed action history entry
4622 : * \param[in] orig_exit_status Action exit status before remapping
4623 : * \param[in] last_failure Resource's last_failure entry, if known
4624 : * \param[in,out] on_fail Resource's current failure handling
4625 : */
4626 : static void
4627 0 : mask_probe_failure(struct action_history *history, int orig_exit_status,
4628 : const xmlNode *last_failure,
4629 : enum action_fail_response *on_fail)
4630 : {
4631 0 : pcmk_resource_t *ban_rsc = history->rsc;
4632 :
4633 0 : if (!pcmk_is_set(history->rsc->flags, pcmk_rsc_unique)) {
4634 0 : ban_rsc = uber_parent(history->rsc);
4635 : }
4636 :
4637 0 : crm_notice("Treating probe result '%s' for %s on %s as 'not running'",
4638 : services_ocf_exitcode_str(orig_exit_status), history->rsc->id,
4639 : pcmk__node_name(history->node));
4640 0 : update_resource_state(history, history->expected_exit_status, last_failure,
4641 : on_fail);
4642 0 : crm_xml_add(history->xml, PCMK_XA_UNAME, history->node->details->uname);
4643 :
4644 0 : record_failed_op(history);
4645 0 : resource_location(ban_rsc, history->node, -PCMK_SCORE_INFINITY,
4646 0 : "masked-probe-failure", history->rsc->cluster);
4647 0 : }
4648 :
4649 : /*!
4650 : * \internal Check whether a given failure is for a given pending action
4651 : *
4652 : * \param[in] history Parsed history entry for pending action
4653 : * \param[in] last_failure Resource's last_failure entry, if known
4654 : *
4655 : * \return true if \p last_failure is failure of pending action in \p history,
4656 : * otherwise false
4657 : * \note Both \p history and \p last_failure must come from the same
4658 : * \c PCMK__XE_LRM_RESOURCE block, as node and resource are assumed to be
4659 : * the same.
4660 : */
4661 : static bool
4662 0 : failure_is_newer(const struct action_history *history,
4663 : const xmlNode *last_failure)
4664 : {
4665 0 : guint failure_interval_ms = 0U;
4666 0 : long long failure_change = 0LL;
4667 0 : long long this_change = 0LL;
4668 :
4669 0 : if (last_failure == NULL) {
4670 0 : return false; // Resource has no last_failure entry
4671 : }
4672 :
4673 0 : if (!pcmk__str_eq(history->task,
4674 : crm_element_value(last_failure, PCMK_XA_OPERATION),
4675 : pcmk__str_none)) {
4676 0 : return false; // last_failure is for different action
4677 : }
4678 :
4679 0 : if ((crm_element_value_ms(last_failure, PCMK_META_INTERVAL,
4680 : &failure_interval_ms) != pcmk_ok)
4681 0 : || (history->interval_ms != failure_interval_ms)) {
4682 0 : return false; // last_failure is for action with different interval
4683 : }
4684 :
4685 0 : if ((pcmk__scan_ll(crm_element_value(history->xml, PCMK_XA_LAST_RC_CHANGE),
4686 : &this_change, 0LL) != pcmk_rc_ok)
4687 0 : || (pcmk__scan_ll(crm_element_value(last_failure,
4688 : PCMK_XA_LAST_RC_CHANGE),
4689 : &failure_change, 0LL) != pcmk_rc_ok)
4690 0 : || (failure_change < this_change)) {
4691 0 : return false; // Failure is not known to be newer
4692 : }
4693 :
4694 0 : return true;
4695 : }
4696 :
4697 : /*!
4698 : * \internal
4699 : * \brief Update a resource's role etc. for a pending action
4700 : *
4701 : * \param[in,out] history Parsed history entry for pending action
4702 : * \param[in] last_failure Resource's last_failure entry, if known
4703 : */
4704 : static void
4705 0 : process_pending_action(struct action_history *history,
4706 : const xmlNode *last_failure)
4707 : {
4708 : /* For recurring monitors, a failure is recorded only in RSC_last_failure_0,
4709 : * and there might be a RSC_monitor_INTERVAL entry with the last successful
4710 : * or pending result.
4711 : *
4712 : * If last_failure contains the failure of the pending recurring monitor
4713 : * we're processing here, and is newer, the action is no longer pending.
4714 : * (Pending results have call ID -1, which sorts last, so the last failure
4715 : * if any should be known.)
4716 : */
4717 0 : if (failure_is_newer(history, last_failure)) {
4718 0 : return;
4719 : }
4720 :
4721 0 : if (strcmp(history->task, PCMK_ACTION_START) == 0) {
4722 0 : pcmk__set_rsc_flags(history->rsc, pcmk_rsc_start_pending);
4723 0 : set_active(history->rsc);
4724 :
4725 0 : } else if (strcmp(history->task, PCMK_ACTION_PROMOTE) == 0) {
4726 0 : history->rsc->role = pcmk_role_promoted;
4727 :
4728 0 : } else if ((strcmp(history->task, PCMK_ACTION_MIGRATE_TO) == 0)
4729 0 : && history->node->details->unclean) {
4730 : /* A migrate_to action is pending on a unclean source, so force a stop
4731 : * on the target.
4732 : */
4733 0 : const char *migrate_target = NULL;
4734 0 : pcmk_node_t *target = NULL;
4735 :
4736 0 : migrate_target = crm_element_value(history->xml,
4737 : PCMK__META_MIGRATE_TARGET);
4738 0 : target = pcmk_find_node(history->rsc->cluster, migrate_target);
4739 0 : if (target != NULL) {
4740 0 : stop_action(history->rsc, target, FALSE);
4741 : }
4742 : }
4743 :
4744 0 : if (history->rsc->pending_task != NULL) {
4745 : /* There should never be multiple pending actions, but as a failsafe,
4746 : * just remember the first one processed for display purposes.
4747 : */
4748 0 : return;
4749 : }
4750 :
4751 0 : if (pcmk_is_probe(history->task, history->interval_ms)) {
4752 : /* Pending probes are currently never displayed, even if pending
4753 : * operations are requested. If we ever want to change that,
4754 : * enable the below and the corresponding part of
4755 : * native.c:native_pending_task().
4756 : */
4757 : #if 0
4758 : history->rsc->pending_task = strdup("probe");
4759 : history->rsc->pending_node = history->node;
4760 : #endif
4761 : } else {
4762 0 : history->rsc->pending_task = strdup(history->task);
4763 0 : history->rsc->pending_node = history->node;
4764 : }
4765 : }
4766 :
/*!
 * \internal
 * \brief Unpack one resource history entry and apply it to the resource
 *
 * The entry is parsed into a struct action_history, checked for expiration,
 * remapped via remap_operation(), and then dispatched according to its
 * execution status: pending actions, completed actions, and the various
 * failure statuses are each handled separately. Entries that lack an ID or
 * operation name, that cannot affect state, or whose result cannot be
 * unpacked are ignored.
 *
 * \param[in,out] rsc           Resource that history entry is for
 * \param[in,out] node          Node that history entry is for
 * \param[in,out] xml_op        History entry XML (may have attributes added)
 * \param[in,out] last_failure  Resource's last_failure entry, if known (passed
 *                              on to failure handling, which may update it)
 * \param[in,out] on_fail       Resource's current failure handling
 */
static void
unpack_rsc_op(pcmk_resource_t *rsc, pcmk_node_t *node, xmlNode *xml_op,
              xmlNode **last_failure, enum action_fail_response *on_fail)
{
    int old_rc = 0;
    bool expired = false;
    pcmk_resource_t *parent = rsc;
    enum rsc_role_e fail_role = pcmk_role_unknown;
    enum action_fail_response failure_strategy = pcmk_on_fail_restart;

    struct action_history history = {
        .rsc = rsc,
        .node = node,
        .xml = xml_op,
        .execution_status = PCMK_EXEC_UNKNOWN,
    };

    CRM_CHECK(rsc && node && xml_op, return);

    history.id = pcmk__xe_id(xml_op);
    if (history.id == NULL) {
        pcmk__config_err("Ignoring resource history entry for %s on %s "
                         "without ID", rsc->id, pcmk__node_name(node));
        return;
    }

    // Task and interval
    history.task = crm_element_value(xml_op, PCMK_XA_OPERATION);
    if (history.task == NULL) {
        pcmk__config_err("Ignoring resource history entry %s for %s on %s "
                         "without " PCMK_XA_OPERATION,
                         history.id, rsc->id, pcmk__node_name(node));
        return;
    }
    crm_element_value_ms(xml_op, PCMK_META_INTERVAL, &(history.interval_ms));
    if (!can_affect_state(&history)) {
        pcmk__rsc_trace(rsc,
                        "Ignoring resource history entry %s for %s on %s "
                        "with irrelevant action '%s'",
                        history.id, rsc->id, pcmk__node_name(node),
                        history.task);
        return;
    }

    // Execution status, exit status, and exit reason
    if (unpack_action_result(&history) != pcmk_rc_ok) {
        return; // Error already logged
    }

    history.expected_exit_status = pe__target_rc_from_xml(xml_op);
    history.key = pcmk__xe_history_key(xml_op);
    crm_element_value_int(xml_op, PCMK__XA_CALL_ID, &(history.call_id));

    pcmk__rsc_trace(rsc, "Unpacking %s (%s call %d on %s): %s (%s)",
                    history.id, history.task, history.call_id,
                    pcmk__node_name(node),
                    pcmk_exec_status_str(history.execution_status),
                    crm_exit_str(history.exit_status));

    if (node->details->unclean) {
        pcmk__rsc_trace(rsc,
                        "%s is running on %s, which is unclean (further action "
                        "depends on value of stop's on-fail attribute)",
                        rsc->id, pcmk__node_name(node));
    }

    expired = check_operation_expiry(&history);

    // Remember the exit status as recorded, before remap_operation() runs
    old_rc = history.exit_status;

    remap_operation(&history, on_fail, expired);

    // pcmk_rc_ok here means the expired entry needs no further processing
    if (expired && (process_expired_result(&history, old_rc) == pcmk_rc_ok)) {
        goto done;
    }

    if (!pcmk__is_bundled(rsc) && pcmk_xe_mask_probe_failure(xml_op)) {
        mask_probe_failure(&history, old_rc, *last_failure, on_fail);
        goto done;
    }

    // Unless the resource is flagged unique, bans below use uber_parent(rsc)
    if (!pcmk_is_set(rsc->flags, pcmk_rsc_unique)) {
        parent = uber_parent(rsc);
    }

    switch (history.execution_status) {
        case PCMK_EXEC_PENDING:
            process_pending_action(&history, *last_failure);
            goto done;

        case PCMK_EXEC_DONE:
            update_resource_state(&history, history.exit_status, *last_failure,
                                  on_fail);
            goto done;

        case PCMK_EXEC_NOT_INSTALLED:
            unpack_failure_handling(&history, &failure_strategy, &fail_role);
            if (failure_strategy == pcmk_on_fail_ignore) {
                crm_warn("Cannot ignore failed %s of %s on %s: "
                         "Resource agent doesn't exist "
                         CRM_XS " status=%d rc=%d id=%s",
                         history.task, rsc->id, pcmk__node_name(node),
                         history.execution_status, history.exit_status,
                         history.id);
                /* Also for printing it as "FAILED" by marking it as
                 * pcmk_rsc_failed later
                 */
                *on_fail = pcmk_on_fail_ban;
            }
            resource_location(parent, node, -PCMK_SCORE_INFINITY,
                              "hard-error", rsc->cluster);
            unpack_rsc_op_failure(&history, failure_strategy, fail_role,
                                  last_failure, on_fail);
            goto done;

        case PCMK_EXEC_NOT_CONNECTED:
            if (pcmk__is_pacemaker_remote_node(node)
                && pcmk_is_set(node->details->remote_rsc->flags,
                               pcmk_rsc_managed)) {
                /* We should never get into a situation where a managed remote
                 * connection resource is considered OK but a resource action
                 * behind the connection gets a "not connected" status. But as a
                 * fail-safe in case a bug or unusual circumstances do lead to
                 * that, ensure the remote connection is considered failed.
                 */
                pcmk__set_rsc_flags(node->details->remote_rsc,
                                    pcmk_rsc_failed|pcmk_rsc_stop_if_failed);
            }
            break; // Not done, do error handling

        case PCMK_EXEC_ERROR:
        case PCMK_EXEC_ERROR_HARD:
        case PCMK_EXEC_ERROR_FATAL:
        case PCMK_EXEC_TIMEOUT:
        case PCMK_EXEC_NOT_SUPPORTED:
        case PCMK_EXEC_INVALID:
            break; // Not done, do error handling

        default: // No other value should be possible at this point
            break;
    }

    /* Error handling: depending on the configured failure handling, either
     * pretend the failed action succeeded or process it as a failure.
     */
    unpack_failure_handling(&history, &failure_strategy, &fail_role);
    if ((failure_strategy == pcmk_on_fail_ignore)
        || ((failure_strategy == pcmk_on_fail_restart_container)
            && (strcmp(history.task, PCMK_ACTION_STOP) == 0))) {

        char *last_change_s = last_change_str(xml_op);

        crm_warn("Pretending failed %s (%s%s%s) of %s on %s at %s succeeded "
                 CRM_XS " %s",
                 history.task, services_ocf_exitcode_str(history.exit_status),
                 (pcmk__str_empty(history.exit_reason)? "" : ": "),
                 pcmk__s(history.exit_reason, ""), rsc->id,
                 pcmk__node_name(node), last_change_s, history.id);
        free(last_change_s);

        // Update state as if the action had returned its expected result
        update_resource_state(&history, history.expected_exit_status,
                              *last_failure, on_fail);
        crm_xml_add(xml_op, PCMK_XA_UNAME, node->details->uname);
        pcmk__set_rsc_flags(rsc, pcmk_rsc_ignore_failure);

        record_failed_op(&history);

        if ((failure_strategy == pcmk_on_fail_restart_container)
            && cmp_on_fail(*on_fail, pcmk_on_fail_restart) <= 0) {
            *on_fail = failure_strategy;
        }

    } else {
        unpack_rsc_op_failure(&history, failure_strategy, fail_role,
                              last_failure, on_fail);

        if (history.execution_status == PCMK_EXEC_ERROR_HARD) {
            uint8_t log_level = LOG_ERR;

            // A missing agent is logged less severely than other hard errors
            if (history.exit_status == PCMK_OCF_NOT_INSTALLED) {
                log_level = LOG_NOTICE;
            }
            do_crm_log(log_level,
                       "Preventing %s from restarting on %s because "
                       "of hard failure (%s%s%s) " CRM_XS " %s",
                       parent->id, pcmk__node_name(node),
                       services_ocf_exitcode_str(history.exit_status),
                       (pcmk__str_empty(history.exit_reason)? "" : ": "),
                       pcmk__s(history.exit_reason, ""), history.id);
            resource_location(parent, node, -PCMK_SCORE_INFINITY,
                              "hard-error", rsc->cluster);

        } else if (history.execution_status == PCMK_EXEC_ERROR_FATAL) {
            pcmk__sched_err("Preventing %s from restarting anywhere because "
                            "of fatal failure (%s%s%s) " CRM_XS " %s",
                            parent->id,
                            services_ocf_exitcode_str(history.exit_status),
                            (pcmk__str_empty(history.exit_reason)? "" : ": "),
                            pcmk__s(history.exit_reason, ""), history.id);
            // A NULL node is passed here, unlike the per-node hard-error ban
            resource_location(parent, NULL, -PCMK_SCORE_INFINITY,
                              "fatal-error", rsc->cluster);
        }
    }

done:
    pcmk__rsc_trace(rsc, "%s role on %s after %s is %s (next %s)",
                    rsc->id, pcmk__node_name(node), history.id,
                    pcmk_role_text(rsc->role),
                    pcmk_role_text(rsc->next_role));
}
4972 :
4973 : static void
4974 0 : add_node_attrs(const xmlNode *xml_obj, pcmk_node_t *node, bool overwrite,
4975 : pcmk_scheduler_t *scheduler)
4976 : {
4977 0 : const char *cluster_name = NULL;
4978 :
4979 0 : pe_rule_eval_data_t rule_data = {
4980 : .node_hash = NULL,
4981 0 : .now = scheduler->now,
4982 : .match_data = NULL,
4983 : .rsc_data = NULL,
4984 : .op_data = NULL
4985 : };
4986 :
4987 0 : pcmk__insert_dup(node->details->attrs,
4988 0 : CRM_ATTR_UNAME, node->details->uname);
4989 :
4990 0 : pcmk__insert_dup(node->details->attrs, CRM_ATTR_ID, node->details->id);
4991 0 : if (pcmk__str_eq(node->details->id, scheduler->dc_uuid, pcmk__str_casei)) {
4992 0 : scheduler->dc_node = node;
4993 0 : node->details->is_dc = TRUE;
4994 0 : pcmk__insert_dup(node->details->attrs,
4995 : CRM_ATTR_IS_DC, PCMK_VALUE_TRUE);
4996 : } else {
4997 0 : pcmk__insert_dup(node->details->attrs,
4998 : CRM_ATTR_IS_DC, PCMK_VALUE_FALSE);
4999 : }
5000 :
5001 0 : cluster_name = g_hash_table_lookup(scheduler->config_hash,
5002 : PCMK_OPT_CLUSTER_NAME);
5003 0 : if (cluster_name) {
5004 0 : pcmk__insert_dup(node->details->attrs, CRM_ATTR_CLUSTER_NAME,
5005 : cluster_name);
5006 : }
5007 :
5008 0 : pe__unpack_dataset_nvpairs(xml_obj, PCMK_XE_INSTANCE_ATTRIBUTES, &rule_data,
5009 0 : node->details->attrs, NULL, overwrite,
5010 : scheduler);
5011 :
5012 0 : pe__unpack_dataset_nvpairs(xml_obj, PCMK_XE_UTILIZATION, &rule_data,
5013 0 : node->details->utilization, NULL,
5014 : FALSE, scheduler);
5015 :
5016 0 : if (pcmk__node_attr(node, CRM_ATTR_SITE_NAME, NULL,
5017 : pcmk__rsc_node_current) == NULL) {
5018 0 : const char *site_name = pcmk__node_attr(node, "site-name", NULL,
5019 : pcmk__rsc_node_current);
5020 :
5021 0 : if (site_name) {
5022 0 : pcmk__insert_dup(node->details->attrs,
5023 : CRM_ATTR_SITE_NAME, site_name);
5024 :
5025 0 : } else if (cluster_name) {
5026 : /* Default to cluster-name if unset */
5027 0 : pcmk__insert_dup(node->details->attrs,
5028 : CRM_ATTR_SITE_NAME, cluster_name);
5029 : }
5030 : }
5031 0 : }
5032 :
5033 : static GList *
5034 0 : extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter)
5035 : {
5036 0 : int counter = -1;
5037 0 : int stop_index = -1;
5038 0 : int start_index = -1;
5039 :
5040 0 : xmlNode *rsc_op = NULL;
5041 :
5042 0 : GList *gIter = NULL;
5043 0 : GList *op_list = NULL;
5044 0 : GList *sorted_op_list = NULL;
5045 :
5046 : /* extract operations */
5047 0 : op_list = NULL;
5048 0 : sorted_op_list = NULL;
5049 :
5050 0 : for (rsc_op = pcmk__xe_first_child(rsc_entry, NULL, NULL, NULL);
5051 0 : rsc_op != NULL; rsc_op = pcmk__xe_next(rsc_op)) {
5052 :
5053 0 : if (pcmk__xe_is(rsc_op, PCMK__XE_LRM_RSC_OP)) {
5054 0 : crm_xml_add(rsc_op, PCMK_XA_RESOURCE, rsc);
5055 0 : crm_xml_add(rsc_op, PCMK_XA_UNAME, node);
5056 0 : op_list = g_list_prepend(op_list, rsc_op);
5057 : }
5058 : }
5059 :
5060 0 : if (op_list == NULL) {
5061 : /* if there are no operations, there is nothing to do */
5062 0 : return NULL;
5063 : }
5064 :
5065 0 : sorted_op_list = g_list_sort(op_list, sort_op_by_callid);
5066 :
5067 : /* create active recurring operations as optional */
5068 0 : if (active_filter == FALSE) {
5069 0 : return sorted_op_list;
5070 : }
5071 :
5072 0 : op_list = NULL;
5073 :
5074 0 : calculate_active_ops(sorted_op_list, &start_index, &stop_index);
5075 :
5076 0 : for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) {
5077 0 : xmlNode *rsc_op = (xmlNode *) gIter->data;
5078 :
5079 0 : counter++;
5080 :
5081 0 : if (start_index < stop_index) {
5082 0 : crm_trace("Skipping %s: not active", pcmk__xe_id(rsc_entry));
5083 0 : break;
5084 :
5085 0 : } else if (counter < start_index) {
5086 0 : crm_trace("Skipping %s: old", pcmk__xe_id(rsc_op));
5087 0 : continue;
5088 : }
5089 0 : op_list = g_list_append(op_list, rsc_op);
5090 : }
5091 :
5092 0 : g_list_free(sorted_op_list);
5093 0 : return op_list;
5094 : }
5095 :
5096 : GList *
5097 0 : find_operations(const char *rsc, const char *node, gboolean active_filter,
5098 : pcmk_scheduler_t *scheduler)
5099 : {
5100 0 : GList *output = NULL;
5101 0 : GList *intermediate = NULL;
5102 :
5103 0 : xmlNode *tmp = NULL;
5104 0 : xmlNode *status = pcmk__xe_first_child(scheduler->input, PCMK_XE_STATUS,
5105 : NULL, NULL);
5106 :
5107 0 : pcmk_node_t *this_node = NULL;
5108 :
5109 0 : xmlNode *node_state = NULL;
5110 :
5111 0 : CRM_CHECK(status != NULL, return NULL);
5112 :
5113 0 : for (node_state = pcmk__xe_first_child(status, NULL, NULL, NULL);
5114 0 : node_state != NULL; node_state = pcmk__xe_next(node_state)) {
5115 :
5116 0 : if (pcmk__xe_is(node_state, PCMK__XE_NODE_STATE)) {
5117 0 : const char *uname = crm_element_value(node_state, PCMK_XA_UNAME);
5118 :
5119 0 : if (node != NULL && !pcmk__str_eq(uname, node, pcmk__str_casei)) {
5120 0 : continue;
5121 : }
5122 :
5123 0 : this_node = pcmk_find_node(scheduler, uname);
5124 0 : if(this_node == NULL) {
5125 0 : CRM_LOG_ASSERT(this_node != NULL);
5126 0 : continue;
5127 :
5128 0 : } else if (pcmk__is_pacemaker_remote_node(this_node)) {
5129 0 : determine_remote_online_status(scheduler, this_node);
5130 :
5131 : } else {
5132 0 : determine_online_status(node_state, this_node, scheduler);
5133 : }
5134 :
5135 0 : if (this_node->details->online
5136 0 : || pcmk_is_set(scheduler->flags, pcmk_sched_fencing_enabled)) {
5137 : /* offline nodes run no resources...
5138 : * unless stonith is enabled in which case we need to
5139 : * make sure rsc start events happen after the stonith
5140 : */
5141 0 : xmlNode *lrm_rsc = NULL;
5142 :
5143 0 : tmp = pcmk__xe_first_child(node_state, PCMK__XE_LRM, NULL,
5144 : NULL);
5145 0 : tmp = pcmk__xe_first_child(tmp, PCMK__XE_LRM_RESOURCES, NULL,
5146 : NULL);
5147 :
5148 0 : for (lrm_rsc = pcmk__xe_first_child(tmp, NULL, NULL, NULL);
5149 0 : lrm_rsc != NULL; lrm_rsc = pcmk__xe_next(lrm_rsc)) {
5150 :
5151 0 : if (pcmk__xe_is(lrm_rsc, PCMK__XE_LRM_RESOURCE)) {
5152 0 : const char *rsc_id = crm_element_value(lrm_rsc,
5153 : PCMK_XA_ID);
5154 :
5155 0 : if (rsc != NULL && !pcmk__str_eq(rsc_id, rsc, pcmk__str_casei)) {
5156 0 : continue;
5157 : }
5158 :
5159 0 : intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter);
5160 0 : output = g_list_concat(output, intermediate);
5161 : }
5162 : }
5163 : }
5164 : }
5165 : }
5166 :
5167 0 : return output;
5168 : }
|