blob: 84a0c20375349a31eb4da92877d3ee8cfef941d6 [file] [log] [blame]
Tomáš Pecka98ad18d2020-11-13 15:39:55 +01001/*
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +02002 * Copyright (C) 2020-2023 CESNET, https://photonics.cesnet.cz/
Tomáš Pecka98ad18d2020-11-13 15:39:55 +01003 *
4 * Written by Tomáš Pecka <tomas.pecka@fit.cvut.cz>
5 *
6 */
7
Tomáš Pecka5a4c0352023-12-12 12:29:28 +01008#include <boost/algorithm/string.hpp>
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +02009#include <regex>
10#include <sysrepo-cpp/Connection.hpp>
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010011#include "Sysrepo.h"
Tomáš Pecka2117ce52023-05-12 11:28:34 +020012#include "utils/alarms.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010013#include "utils/log.h"
Tomáš Peckaba2dc312021-01-23 22:29:11 +010014#include "utils/sysrepo.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010015
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010016namespace {
17
/* Severity string used whenever an alarm is being cleared */
constexpr auto ALARM_CLEARED = "cleared";

/* Raised with a fixed severity when a sensor stops reporting its value */
constexpr auto ALARM_SENSOR_MISSING = "velia-alarms:sensor-missing-alarm";
constexpr auto ALARM_MISSING_SEVERITY = "warning";
constexpr auto ALARM_MISSING_DESCRIPTION = "Sensor value not reported. Maybe the sensor was unplugged?";

/* Threshold crossing alarms; their severity is derived from the threshold state at the time of raising */
constexpr auto ALARM_THRESHOLD_CROSSING_LOW = "velia-alarms:sensor-low-value-alarm";
constexpr auto ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION = "Sensor value crossed low threshold.";
constexpr auto ALARM_THRESHOLD_CROSSING_HIGH = "velia-alarms:sensor-high-value-alarm";
constexpr auto ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION = "Sensor value crossed high threshold.";

/* Raised with a fixed severity when a sensor reports the "nonoperational" oper-status */
constexpr auto ALARM_SENSOR_NONOPERATIONAL = "velia-alarms:sensor-nonoperational";
constexpr auto ALARM_SENSOR_NONOPERATIONAL_SEVERITY = "warning";
constexpr auto ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION = "Sensor is nonoperational. The values it reports may not be relevant.";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020029
/** @brief Returns the `/ietf-hardware:hardware/component[name=...]` part of an XPath that points to (or below) a component node
 *
 * The component name may be enclosed in single or double quotes; the closing quote has to be the same character as the opening one.
 *
 * Example input: /ietf-hardware:hardware/component[name='ne:psu:child']/oper-state/disabled
 * Example output: /ietf-hardware:hardware/component[name='ne:psu:child']
 *
 * @throws std::logic_error when the input does not begin with a component node carrying a name predicate
 */
std::string extractComponentPrefix(const std::string& componentXPath)
{
    static const std::regex componentRe(R"((/ietf-hardware:hardware/component\[name=('|").*?(\2)\]).*)");

    std::smatch groups;
    if (!std::regex_match(componentXPath, groups, componentRe)) {
        throw std::logic_error("Invalid xPath provided ('" + componentXPath + "')");
    }

    // the first capture group covers the component node together with its name predicate
    return groups[1].str();
}
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020046
47void logAlarm(velia::Log logger, const std::string_view sensor, const std::string_view alarm, const std::string_view severity)
48{
49 logger->info("Alarm {}: {} for {}", alarm, severity, sensor);
50}
Tomáš Pecka9af47392023-05-23 14:56:48 +020051
52bool isThresholdCrossingLow(velia::ietf_hardware::State state)
53{
54 return state == velia::ietf_hardware::State::WarningLow || state == velia::ietf_hardware::State::CriticalLow;
55}
56
57bool isThresholdCrossingHigh(velia::ietf_hardware::State state)
58{
59 return state == velia::ietf_hardware::State::WarningHigh || state == velia::ietf_hardware::State::CriticalHigh;
60}
61
62std::string toYangAlarmSeverity(velia::ietf_hardware::State state)
63{
64 switch (state) {
65 case velia::ietf_hardware::State::WarningLow:
66 case velia::ietf_hardware::State::WarningHigh:
67 return "warning";
68 case velia::ietf_hardware::State::CriticalLow:
69 case velia::ietf_hardware::State::CriticalHigh:
70 return "critical";
71 default: {
72 std::ostringstream oss;
73 oss << "No severity associated with sensor threshold State " << state;
74 throw std::logic_error(oss.str());
75 }
76 }
77}
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010078}
79
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020080namespace velia::ietf_hardware::sysrepo {
81
82/** @brief The constructor expects the HardwareState instance which will provide the actual hardware state data and the poll interval */
83Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr<IETFHardware> hwState, std::chrono::microseconds pollInterval)
84 : m_log(spdlog::get("hardware"))
85 , m_pollInterval(std::move(pollInterval))
86 , m_session(std::move(session))
87 , m_hwState(std::move(hwState))
88 , m_quit(false)
Tomáš Pecka2117ce52023-05-12 11:28:34 +020089{
Tomáš Pecka2117ce52023-05-12 11:28:34 +020090 m_pollThread = std::thread([&]() {
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020091 auto conn = m_session.getConnection();
92
93 DataTree prevValues;
Tomáš Peckac0991ce2023-12-20 15:46:03 +010094 std::set<std::string> seenSensors;
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020095 std::map<std::string, State> thresholdsStates;
Tomáš Pecka26b38212024-01-16 17:23:31 +010096 std::set<std::pair<std::string, std::string>> activeSideLoadedAlarms;
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020097
Tomáš Pecka9bd61272024-01-31 15:08:13 +010098 alarms::pushInventory(
99 m_session,
100 {
101 {ALARM_THRESHOLD_CROSSING_LOW, "Sensor value is below the low threshold."},
102 {ALARM_THRESHOLD_CROSSING_HIGH, "Sensor value is above the high threshold."},
103 {ALARM_SENSOR_MISSING, "Sensor is missing."},
104 {ALARM_SENSOR_NONOPERATIONAL, "Sensor is flagged as nonoperational."},
105 });
Tomáš Pecka2848fd02024-01-30 12:05:59 +0100106
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200107 while (!m_quit) {
108 m_log->trace("IetfHardware poll");
109
Tomáš Pecka26b38212024-01-16 17:23:31 +0100110 auto [hwStateValues, thresholds, activeSensors, sideLoadedAlarms] = m_hwState->process();
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200111 std::set<std::string> deletedComponents;
Tomáš Pecka87844292024-01-30 15:20:21 +0100112 std::vector<std::string> newSensors;
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200113
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100114 for (const auto& sensorXPath : activeSensors) {
115 if (!seenSensors.contains(sensorXPath)) {
Tomáš Pecka87844292024-01-30 15:20:21 +0100116 newSensors.emplace_back(extractComponentPrefix(sensorXPath));
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100117 }
118 }
119 seenSensors.merge(activeSensors);
120
Tomáš Pecka87844292024-01-30 15:20:21 +0100121 if (!newSensors.empty()) {
Tomáš Peckabbfc1c32024-01-31 13:58:11 +0100122 alarms::addResourcesToInventory(m_session, {
123 {ALARM_THRESHOLD_CROSSING_LOW, newSensors},
124 {ALARM_THRESHOLD_CROSSING_HIGH, newSensors},
125 {ALARM_SENSOR_MISSING, newSensors},
126 {ALARM_SENSOR_NONOPERATIONAL, newSensors},
127 });
Tomáš Pecka87844292024-01-30 15:20:21 +0100128 }
129
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200130 /* Some data readers can stop returning data in some cases (e.g. ejected PSU).
131 * Prune tree components that were removed before updating to avoid having not current data from previous invocations.
132 */
133 for (const auto& [k, v] : prevValues) {
134 if (!hwStateValues.contains(k)) {
135 deletedComponents.emplace(extractComponentPrefix(k));
136 }
137 }
138
Jan Kundrát498c3f82023-05-24 19:25:48 +0200139 std::vector<std::string> discards;
140 discards.reserve(deletedComponents.size());
141 std::copy(deletedComponents.begin(), deletedComponents.end(), std::back_inserter(discards));
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200142
Jan Kundrát58855c42024-09-25 19:31:56 +0200143 m_log->trace("updating HW state ({} entries)", hwStateValues.size());
Jan Kundrát498c3f82023-05-24 19:25:48 +0200144 utils::valuesPush(hwStateValues, {}, discards, m_session, ::sysrepo::Datastore::Operational);
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200145
Tomáš Pecka26b38212024-01-16 17:23:31 +0100146 /* Publish sideloaded alarms */
147 for (const auto& [alarm, resource, severity, text] : sideLoadedAlarms) {
148 // Sideloaded alarms are not registered using the code above, let's register those too
Tomáš Peckabbfc1c32024-01-31 13:58:11 +0100149 alarms::addResourcesToInventory(m_session, {{ALARM_SENSOR_MISSING, {resource}}});
Tomáš Pecka26b38212024-01-16 17:23:31 +0100150
151 bool isActive = activeSideLoadedAlarms.contains({alarm, resource});
152 if (isActive && severity == "cleared") {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100153 alarms::push(m_session, alarm, resource, "cleared", text);
Tomáš Pecka26b38212024-01-16 17:23:31 +0100154 activeSideLoadedAlarms.erase({alarm, resource});
155 } else if (!isActive && severity != "cleared") {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100156 alarms::push(m_session, alarm, resource, severity, text);
Tomáš Pecka26b38212024-01-16 17:23:31 +0100157 activeSideLoadedAlarms.insert({alarm, resource});
158 }
159 }
160
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100161 /* Look for nonoperational sensors to set alarms */
162 for (const auto& [leaf, value] : hwStateValues) {
163 if (boost::ends_with(leaf, "/sensor-data/oper-status")) {
164 std::optional<std::string> oldValue;
165
166 if (auto it = prevValues.find(leaf); it != prevValues.end()) {
167 oldValue = it->second;
168 }
169
170 if (value == "nonoperational" && oldValue != "nonoperational") {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100171 alarms::push(m_session, ALARM_SENSOR_NONOPERATIONAL, extractComponentPrefix(leaf), ALARM_SENSOR_NONOPERATIONAL_SEVERITY, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100172 } else if (value == "ok" && oldValue && oldValue != "ok" /* don't call clear-alarm if we see this node for the first time, i.e., oldvalue is nullopt */) {
Tomáš Peckad694bc52024-01-30 09:53:06 +0100173 alarms::push(m_session, ALARM_SENSOR_NONOPERATIONAL, extractComponentPrefix(leaf), ALARM_CLEARED, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100174 }
175 }
176 }
177
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200178 for (const auto& [sensorXPath, state] : thresholds) {
179 // missing prevState can be considered as Normal
180 const State prevState = [&, sensorXPath = sensorXPath] {
181 if (auto it = thresholdsStates.find(sensorXPath); it != thresholdsStates.end()) {
182 return it->second;
183 }
184 return State::Normal;
185 }();
186 const auto componentXPath = extractComponentPrefix(sensorXPath);
187
188 if (state == State::NoValue) {
Jan Kundrát2ec37482024-01-11 21:38:20 +0100189 logAlarm(m_log, componentXPath, ALARM_SENSOR_MISSING, ALARM_MISSING_SEVERITY);
Tomáš Peckad694bc52024-01-30 09:53:06 +0100190 alarms::push(m_session, ALARM_SENSOR_MISSING, componentXPath, ALARM_MISSING_SEVERITY, ALARM_MISSING_DESCRIPTION);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200191 } else if (prevState == State::NoValue) {
Jan Kundrát2ec37482024-01-11 21:38:20 +0100192 logAlarm(m_log, componentXPath, ALARM_SENSOR_MISSING, ALARM_CLEARED);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200193 /* The alarm message is same for both setting and clearing the alarm. RFC8632 says that it is
194 * "The string used to inform operators about the alarm. This MUST contain enough information for an operator to be able to understand the problem and how to resolve it.",
195 * i.e., from my POV it does not make sense to say something like "cleared" when clearing the alarm as this would not be beneficial for the operator to understand what happened.
196 */
Tomáš Peckad694bc52024-01-30 09:53:06 +0100197 alarms::push(m_session, ALARM_SENSOR_MISSING, componentXPath, ALARM_CLEARED, ALARM_MISSING_DESCRIPTION);
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200198 }
199
Tomáš Pecka9af47392023-05-23 14:56:48 +0200200 /*
201 * We set new threshold alarms first. In case the sensor value transitions from high to low (or low to high) we don't want to lose any active alarm on the resource.
202 *
203 * In case new state corresponds to threshold crossing (wither lower bound or upper bound) we set the alarm.
204 * Since we receive only changes to states it should be sufficient to just check if the new state crossed the threshold.
205 * We shouldn't receive any "no-op" state change (e.g. warning low to warning low) and even if we did receive such change, we would only set the same alarm again.
206 * We can however receive a change from critical threshold to warning threshold (or warning to critical) but that is no problem.
207 * We only need to set the same alarm again with the new severity.
208 */
209 if (isThresholdCrossingLow(state)) {
210 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, toYangAlarmSeverity(state));
Tomáš Peckad694bc52024-01-30 09:53:06 +0100211 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_LOW, componentXPath, toYangAlarmSeverity(state), ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200212 } else if (isThresholdCrossingHigh(state)) {
213 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, toYangAlarmSeverity(state));
Tomáš Peckad694bc52024-01-30 09:53:06 +0100214 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_HIGH, componentXPath, toYangAlarmSeverity(state), ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200215 }
216
217 /* Now we can clear the old threshold alarms that are no longer active, i.e., we transition away from the CriticalLow/WarningLow or CriticalHigh/WarningHigh. */
218 if (!isThresholdCrossingLow(state) && isThresholdCrossingLow(prevState)) {
219 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, ALARM_CLEARED);
Tomáš Peckad694bc52024-01-30 09:53:06 +0100220 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_LOW, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200221 } else if (!isThresholdCrossingHigh(state) && isThresholdCrossingHigh(prevState)) {
222 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, ALARM_CLEARED);
Tomáš Peckad694bc52024-01-30 09:53:06 +0100223 alarms::push(m_session, ALARM_THRESHOLD_CROSSING_HIGH, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION);
Tomáš Pecka9af47392023-05-23 14:56:48 +0200224 }
225
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200226 thresholdsStates[sensorXPath] = state;
227 }
228
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200229 prevValues = std::move(hwStateValues);
230 std::this_thread::sleep_for(m_pollInterval);
Tomáš Peckabbfc1c32024-01-31 13:58:11 +0100231 }
Tomáš Pecka2117ce52023-05-12 11:28:34 +0200232 });
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200233}
Václav Kubernát7efd6d52021-11-09 01:31:11 +0100234
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200235Sysrepo::~Sysrepo()
236{
237 m_log->trace("Requesting poll thread stop");
238 m_quit = true;
239 m_pollThread.join();
Tomáš Pecka98ad18d2020-11-13 15:39:55 +0100240}
241}