blob: a65f4d100598369fcd96081239f2b6393377a27a [file] [log] [blame]
/*
 * Copyright (C) 2020-2023 CESNET, https://photonics.cesnet.cz/
 *
 * Written by Tomáš Pecka <tomas.pecka@fit.cvut.cz>
 *
 */
7
Tomáš Pecka5a4c0352023-12-12 12:29:28 +01008#include <boost/algorithm/string.hpp>
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +02009#include <regex>
10#include <sysrepo-cpp/Connection.hpp>
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010011#include "Sysrepo.h"
Tomáš Pecka2117ce52023-05-12 11:28:34 +020012#include "utils/alarms.h"
Jan Kundrátb98b4f92024-10-17 17:39:24 +020013#include "utils/benchmark.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010014#include "utils/log.h"
Tomáš Peckaba2dc312021-01-23 22:29:11 +010015#include "utils/sysrepo.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010016
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010017namespace {
18
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020019const auto ALARM_CLEARED = "cleared";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020020const auto ALARM_SENSOR_MISSING = "velia-alarms:sensor-missing-alarm";
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020021const auto ALARM_MISSING_SEVERITY = "warning";
22const auto ALARM_MISSING_DESCRIPTION = "Sensor value not reported. Maybe the sensor was unplugged?";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020023const auto ALARM_THRESHOLD_CROSSING_LOW = "velia-alarms:sensor-low-value-alarm";
Tomáš Pecka61212852024-09-30 14:36:50 +020024const auto ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION = "Sensor value crossed low threshold ({} < {}).";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020025const auto ALARM_THRESHOLD_CROSSING_HIGH = "velia-alarms:sensor-high-value-alarm";
Tomáš Pecka61212852024-09-30 14:36:50 +020026const auto ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION = "Sensor value crossed high threshold ({} > {}).";
27const auto ALARM_THRESHOLD_OK = "Sensor value is within normal parameters.";
Tomáš Pecka5a4c0352023-12-12 12:29:28 +010028const auto ALARM_SENSOR_NONOPERATIONAL = "velia-alarms:sensor-nonoperational";
29const auto ALARM_SENSOR_NONOPERATIONAL_SEVERITY = "warning";
30const auto ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION = "Sensor is nonoperational. The values it reports may not be relevant.";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020031
/** @brief Returns the leading /ietf-hardware:hardware/component[name=...] part of an XPath
 *
 * The component name may be quoted with either single or double quotes; the closing quote must be the same character as the opening one.
 *
 * Example input:  /ietf-hardware:hardware/component[name='ne:psu:child']/oper-state/disabled
 * Example output: /ietf-hardware:hardware/component[name='ne:psu:child']
 *
 * @param componentXPath XPath of a component node or of any node below it
 * @return The XPath of the component list entry itself
 * @throws std::logic_error when the input does not start with a component list entry
 */
std::string extractComponentPrefix(const std::string& componentXPath)
{
    // group 1: the component prefix, group 2: the opening quote which has to be repeated as the closing one
    static const std::regex componentRe(R"((/ietf-hardware:hardware/component\[name=('|").*?(\2)\]).*)");

    std::smatch groups;
    if (!std::regex_match(componentXPath, groups, componentRe)) {
        throw std::logic_error("Invalid xPath provided ('" + componentXPath + "')");
    }

    return groups[1].str();
}
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020048
49void logAlarm(velia::Log logger, const std::string_view sensor, const std::string_view alarm, const std::string_view severity)
50{
51 logger->info("Alarm {}: {} for {}", alarm, severity, sensor);
52}
Tomáš Pecka9af47392023-05-23 14:56:48 +020053
54bool isThresholdCrossingLow(velia::ietf_hardware::State state)
55{
56 return state == velia::ietf_hardware::State::WarningLow || state == velia::ietf_hardware::State::CriticalLow;
57}
58
59bool isThresholdCrossingHigh(velia::ietf_hardware::State state)
60{
61 return state == velia::ietf_hardware::State::WarningHigh || state == velia::ietf_hardware::State::CriticalHigh;
62}
63
64std::string toYangAlarmSeverity(velia::ietf_hardware::State state)
65{
66 switch (state) {
67 case velia::ietf_hardware::State::WarningLow:
68 case velia::ietf_hardware::State::WarningHigh:
69 return "warning";
70 case velia::ietf_hardware::State::CriticalLow:
71 case velia::ietf_hardware::State::CriticalHigh:
72 return "critical";
73 default: {
74 std::ostringstream oss;
75 oss << "No severity associated with sensor threshold State " << state;
76 throw std::logic_error(oss.str());
77 }
78 }
79}
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010080}
81
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020082namespace velia::ietf_hardware::sysrepo {
83
84/** @brief The constructor expects the HardwareState instance which will provide the actual hardware state data and the poll interval */
85Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr<IETFHardware> hwState, std::chrono::microseconds pollInterval)
86 : m_log(spdlog::get("hardware"))
87 , m_pollInterval(std::move(pollInterval))
88 , m_session(std::move(session))
89 , m_hwState(std::move(hwState))
90 , m_quit(false)
Tomáš Pecka2117ce52023-05-12 11:28:34 +020091{
Tomáš Pecka2117ce52023-05-12 11:28:34 +020092 m_pollThread = std::thread([&]() {
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020093 auto conn = m_session.getConnection();
94
95 DataTree prevValues;
Tomáš Peckac0991ce2023-12-20 15:46:03 +010096 std::set<std::string> seenSensors;
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020097 std::map<std::string, State> thresholdsStates;
Tomáš Pecka26b38212024-01-16 17:23:31 +010098 std::set<std::pair<std::string, std::string>> activeSideLoadedAlarms;
Tomáš Pecka11b49b82024-10-21 14:28:30 +020099 std::set<std::pair<std::string, std::string>> seenSideLoadedAlarms;
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200100
Tomáš Pecka9bd61272024-01-31 15:08:13 +0100101 alarms::pushInventory(
102 m_session,
103 {
104 {ALARM_THRESHOLD_CROSSING_LOW, "Sensor value is below the low threshold."},
105 {ALARM_THRESHOLD_CROSSING_HIGH, "Sensor value is above the high threshold."},
106 {ALARM_SENSOR_MISSING, "Sensor is missing."},
107 {ALARM_SENSOR_NONOPERATIONAL, "Sensor is flagged as nonoperational."},
108 });
Tomáš Pecka2848fd02024-01-30 12:05:59 +0100109
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200110 while (!m_quit) {
Jan Kundrátb98b4f92024-10-17 17:39:24 +0200111 auto benchmark = std::make_optional<velia::utils::MeasureTime>("ietf-hardware/poll");
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200112 m_log->trace("IetfHardware poll");
113
Tomáš Pecka26b38212024-01-16 17:23:31 +0100114 auto [hwStateValues, thresholds, activeSensors, sideLoadedAlarms] = m_hwState->process();
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200115 std::set<std::string> deletedComponents;
Tomáš Pecka87844292024-01-30 15:20:21 +0100116 std::vector<std::string> newSensors;
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200117
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100118 for (const auto& sensorXPath : activeSensors) {
119 if (!seenSensors.contains(sensorXPath)) {
Tomáš Pecka87844292024-01-30 15:20:21 +0100120 newSensors.emplace_back(extractComponentPrefix(sensorXPath));
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100121 }
122 }
123 seenSensors.merge(activeSensors);
124
Tomáš Pecka87844292024-01-30 15:20:21 +0100125 if (!newSensors.empty()) {
Tomáš Peckabbfc1c32024-01-31 13:58:11 +0100126 alarms::addResourcesToInventory(m_session, {
127 {ALARM_THRESHOLD_CROSSING_LOW, newSensors},
128 {ALARM_THRESHOLD_CROSSING_HIGH, newSensors},
129 {ALARM_SENSOR_MISSING, newSensors},
130 {ALARM_SENSOR_NONOPERATIONAL, newSensors},
131 });
Tomáš Pecka87844292024-01-30 15:20:21 +0100132 }
133
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200134 /* Some data readers can stop returning data in some cases (e.g. ejected PSU).
135 * Prune tree components that were removed before updating to avoid having not current data from previous invocations.
136 */
137 for (const auto& [k, v] : prevValues) {
138 if (!hwStateValues.contains(k)) {
139 deletedComponents.emplace(extractComponentPrefix(k));
140 }
141 }
142
Jan Kundrát498c3f82023-05-24 19:25:48 +0200143 std::vector<std::string> discards;
144 discards.reserve(deletedComponents.size());
145 std::copy(deletedComponents.begin(), deletedComponents.end(), std::back_inserter(discards));
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200146
Jan Kundrát58855c42024-09-25 19:31:56 +0200147 m_log->trace("updating HW state ({} entries)", hwStateValues.size());
Jan Kundrát498c3f82023-05-24 19:25:48 +0200148 utils::valuesPush(hwStateValues, {}, discards, m_session, ::sysrepo::Datastore::Operational);
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200149
Tomáš Pecka26b38212024-01-16 17:23:31 +0100150 /* Publish sideloaded alarms */
151 for (const auto& [alarm, resource, severity, text] : sideLoadedAlarms) {
Tomáš Pecka11b49b82024-10-21 14:28:30 +0200152 // Sideloaded alarms' resources are not registered using the code above, let's register those too
153 if (!seenSideLoadedAlarms.contains({alarm, resource})) {
Jan Kundrát4a48aa42024-10-23 10:22:20 +0200154 alarms::addResourcesToInventory(m_session, {{alarm, {resource}}});
Tomáš Pecka11b49b82024-10-21 14:28:30 +0200155 seenSideLoadedAlarms.insert({alarm, resource});
156 }
Tomáš Pecka26b38212024-01-16 17:23:31 +0100157
158 bool isActive = activeSideLoadedAlarms.contains({alarm, resource});
Tomáš Pecka8f4075b2024-10-22 13:54:48 +0200159 if (isActive && severity == ALARM_CLEARED) {
160 alarms::push(m_session, alarm, resource, ALARM_CLEARED, text);
Tomáš Pecka26b38212024-01-16 17:23:31 +0100161 activeSideLoadedAlarms.erase({alarm, resource});
Tomáš Pecka8f4075b2024-10-22 13:54:48 +0200162 } else if (!isActive && severity != ALARM_CLEARED) {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100163 alarms::push(m_session, alarm, resource, severity, text);
Tomáš Pecka26b38212024-01-16 17:23:31 +0100164 activeSideLoadedAlarms.insert({alarm, resource});
165 }
166 }
167
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100168 /* Look for nonoperational sensors to set alarms */
169 for (const auto& [leaf, value] : hwStateValues) {
170 if (boost::ends_with(leaf, "/sensor-data/oper-status")) {
171 std::optional<std::string> oldValue;
172
173 if (auto it = prevValues.find(leaf); it != prevValues.end()) {
174 oldValue = it->second;
175 }
176
177 if (value == "nonoperational" && oldValue != "nonoperational") {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100178 alarms::push(m_session, ALARM_SENSOR_NONOPERATIONAL, extractComponentPrefix(leaf), ALARM_SENSOR_NONOPERATIONAL_SEVERITY, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100179 } else if (value == "ok" && oldValue && oldValue != "ok" /* don't call clear-alarm if we see this node for the first time, i.e., oldvalue is nullopt */) {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100180 alarms::push(m_session, ALARM_SENSOR_NONOPERATIONAL, extractComponentPrefix(leaf), ALARM_CLEARED, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100181 }
182 }
183 }
184
Tomáš Pecka61212852024-09-30 14:36:50 +0200185 for (const auto& [sensorXPath, updatedThresholdCrossing] : thresholds) {
186 auto [state, newValue, exceededThresholdValue] = updatedThresholdCrossing;
187
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200188 // missing prevState can be considered as Normal
189 const State prevState = [&, sensorXPath = sensorXPath] {
190 if (auto it = thresholdsStates.find(sensorXPath); it != thresholdsStates.end()) {
191 return it->second;
192 }
193 return State::Normal;
194 }();
195 const auto componentXPath = extractComponentPrefix(sensorXPath);
196
197 if (state == State::NoValue) {
Jan Kundrát2ec37482024-01-11 21:38:20 +0100198 logAlarm(m_log, componentXPath, ALARM_SENSOR_MISSING, ALARM_MISSING_SEVERITY);
Tomáš Peckad694bc52024-01-30 09:53:06 +0100199 alarms::push(m_session, ALARM_SENSOR_MISSING, componentXPath, ALARM_MISSING_SEVERITY, ALARM_MISSING_DESCRIPTION);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200200 } else if (prevState == State::NoValue) {
Jan Kundrát2ec37482024-01-11 21:38:20 +0100201 logAlarm(m_log, componentXPath, ALARM_SENSOR_MISSING, ALARM_CLEARED);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200202 /* The alarm message is same for both setting and clearing the alarm. RFC8632 says that it is
203 * "The string used to inform operators about the alarm. This MUST contain enough information for an operator to be able to understand the problem and how to resolve it.",
204 * i.e., from my POV it does not make sense to say something like "cleared" when clearing the alarm as this would not be beneficial for the operator to understand what happened.
205 */
Tomáš Peckad694bc52024-01-30 09:53:06 +0100206 alarms::push(m_session, ALARM_SENSOR_MISSING, componentXPath, ALARM_CLEARED, ALARM_MISSING_DESCRIPTION);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200207 }
208
Tomáš Pecka9af47392023-05-23 14:56:48 +0200209 /*
210 * We set new threshold alarms first. In case the sensor value transitions from high to low (or low to high) we don't want to lose any active alarm on the resource.
211 *
212 * In case new state corresponds to threshold crossing (wither lower bound or upper bound) we set the alarm.
213 * Since we receive only changes to states it should be sufficient to just check if the new state crossed the threshold.
214 * We shouldn't receive any "no-op" state change (e.g. warning low to warning low) and even if we did receive such change, we would only set the same alarm again.
215 * We can however receive a change from critical threshold to warning threshold (or warning to critical) but that is no problem.
216 * We only need to set the same alarm again with the new severity.
217 */
218 if (isThresholdCrossingLow(state)) {
219 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, toYangAlarmSeverity(state));
Tomáš Pecka61212852024-09-30 14:36:50 +0200220 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_LOW, componentXPath, toYangAlarmSeverity(state),
221 fmt::format(fmt::runtime(ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION), *newValue, *exceededThresholdValue));
Tomáš Pecka9af47392023-05-23 14:56:48 +0200222 } else if (isThresholdCrossingHigh(state)) {
223 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, toYangAlarmSeverity(state));
Tomáš Pecka61212852024-09-30 14:36:50 +0200224 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_HIGH, componentXPath, toYangAlarmSeverity(state),
225 fmt::format(fmt::runtime(ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION), *newValue, *exceededThresholdValue));
Tomáš Pecka9af47392023-05-23 14:56:48 +0200226 }
227
228 /* Now we can clear the old threshold alarms that are no longer active, i.e., we transition away from the CriticalLow/WarningLow or CriticalHigh/WarningHigh. */
229 if (!isThresholdCrossingLow(state) && isThresholdCrossingLow(prevState)) {
230 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, ALARM_CLEARED);
Tomáš Pecka61212852024-09-30 14:36:50 +0200231 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_LOW, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_OK);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200232 } else if (!isThresholdCrossingHigh(state) && isThresholdCrossingHigh(prevState)) {
233 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, ALARM_CLEARED);
Tomáš Pecka61212852024-09-30 14:36:50 +0200234 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_HIGH, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_OK);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200235 }
236
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200237 thresholdsStates[sensorXPath] = state;
238 }
239
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200240 prevValues = std::move(hwStateValues);
Jan Kundrátb98b4f92024-10-17 17:39:24 +0200241 benchmark.reset();
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200242 std::this_thread::sleep_for(m_pollInterval);
Tomáš Peckabbfc1c32024-01-31 13:58:11 +0100243 }
Tomáš Pecka2117ce52023-05-12 11:28:34 +0200244 });
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200245}
Václav Kubernát7efd6d52021-11-09 01:31:11 +0100246
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200247Sysrepo::~Sysrepo()
248{
249 m_log->trace("Requesting poll thread stop");
250 m_quit = true;
251 m_pollThread.join();
Tomáš Pecka98ad18d2020-11-13 15:39:55 +0100252}
253}