blob: 1a3b0f4dfb241b1d39fb3cdb009fbcd1e09f34a3 [file] [log] [blame]
Tomáš Pecka98ad18d2020-11-13 15:39:55 +01001/*
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +02002 * Copyright (C) 2020-2023 CESNET, https://photonics.cesnet.cz/
Tomáš Pecka98ad18d2020-11-13 15:39:55 +01003 *
4 * Written by Tomáš Pecka <tomas.pecka@fit.cvut.cz>
5 *
6 */
7
Tomáš Pecka5a4c0352023-12-12 12:29:28 +01008#include <boost/algorithm/string.hpp>
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +02009#include <regex>
10#include <sysrepo-cpp/Connection.hpp>
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010011#include "Sysrepo.h"
Tomáš Pecka2117ce52023-05-12 11:28:34 +020012#include "utils/alarms.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010013#include "utils/log.h"
Tomáš Peckaba2dc312021-01-23 22:29:11 +010014#include "utils/sysrepo.h"
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010015
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010016namespace {
17
/* Severity value used when an alarm is being cleared. */
constexpr auto ALARM_CLEARED = "cleared";

/* Alarm raised when a sensor stops reporting its value. */
constexpr auto ALARM_SENSOR_MISSING = "velia-alarms:sensor-missing-alarm";
/* Same alarm type as ALARM_SENSOR_MISSING. Both names are used in this file, so this one is
 * defined as an alias of the canonical constant to keep the two from drifting apart. */
constexpr auto ALARM_MISSING = ALARM_SENSOR_MISSING;
constexpr auto ALARM_MISSING_SEVERITY = "warning";
constexpr auto ALARM_MISSING_DESCRIPTION = "Sensor value not reported. Maybe the sensor was unplugged?";

/* Alarms raised when a sensor value crosses its low or high threshold. */
constexpr auto ALARM_THRESHOLD_CROSSING_LOW = "velia-alarms:sensor-low-value-alarm";
constexpr auto ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION = "Sensor value crossed low threshold.";
constexpr auto ALARM_THRESHOLD_CROSSING_HIGH = "velia-alarms:sensor-high-value-alarm";
constexpr auto ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION = "Sensor value crossed high threshold.";

/* Alarm raised when a sensor reports the "nonoperational" oper-status. */
constexpr auto ALARM_SENSOR_NONOPERATIONAL = "velia-alarms:sensor-nonoperational";
constexpr auto ALARM_SENSOR_NONOPERATIONAL_SEVERITY = "warning";
constexpr auto ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION = "Sensor is nonoperational. The values it reports may not be relevant.";
Tomáš Pecka2117ce52023-05-12 11:28:34 +020030
/** @brief Returns the leading /ietf-hardware:hardware/component[name=...] part of an XPath
 *
 * The component name may be quoted with either single or double quotes; the closing quote has to
 * be the same character as the opening one. Anything following the component predicate is dropped.
 *
 * Example input: /ietf-hardware:hardware/component[name='ne:psu:child']/oper-state/disabled
 * Example output: /ietf-hardware:hardware/component[name='ne:psu:child']
 *
 * @param componentXPath XPath pointing to a component node or to anything below it
 * @return The XPath of the component node itself
 * @throws std::logic_error when the input does not start with a component node XPath
 */
std::string extractComponentPrefix(const std::string& componentXPath)
{
    static const std::regex componentPattern(R"((/ietf-hardware:hardware/component\[name=('|").*?(\2)\]).*)");

    std::smatch captures;
    if (!std::regex_match(componentXPath, captures, componentPattern)) {
        throw std::logic_error("Invalid xPath provided ('" + componentXPath + "')");
    }

    // the first capture group spans the component node including its [name=...] predicate
    return captures[1].str();
}
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020047
48void logAlarm(velia::Log logger, const std::string_view sensor, const std::string_view alarm, const std::string_view severity)
49{
50 logger->info("Alarm {}: {} for {}", alarm, severity, sensor);
51}
Tomáš Pecka9af47392023-05-23 14:56:48 +020052
53bool isThresholdCrossingLow(velia::ietf_hardware::State state)
54{
55 return state == velia::ietf_hardware::State::WarningLow || state == velia::ietf_hardware::State::CriticalLow;
56}
57
58bool isThresholdCrossingHigh(velia::ietf_hardware::State state)
59{
60 return state == velia::ietf_hardware::State::WarningHigh || state == velia::ietf_hardware::State::CriticalHigh;
61}
62
63std::string toYangAlarmSeverity(velia::ietf_hardware::State state)
64{
65 switch (state) {
66 case velia::ietf_hardware::State::WarningLow:
67 case velia::ietf_hardware::State::WarningHigh:
68 return "warning";
69 case velia::ietf_hardware::State::CriticalLow:
70 case velia::ietf_hardware::State::CriticalHigh:
71 return "critical";
72 default: {
73 std::ostringstream oss;
74 oss << "No severity associated with sensor threshold State " << state;
75 throw std::logic_error(oss.str());
76 }
77 }
78}
Tomáš Pecka98ad18d2020-11-13 15:39:55 +010079}
80
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020081namespace velia::ietf_hardware::sysrepo {
82
83/** @brief The constructor expects the HardwareState instance which will provide the actual hardware state data and the poll interval */
84Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr<IETFHardware> hwState, std::chrono::microseconds pollInterval)
85 : m_log(spdlog::get("hardware"))
86 , m_pollInterval(std::move(pollInterval))
87 , m_session(std::move(session))
88 , m_hwState(std::move(hwState))
89 , m_quit(false)
Tomáš Pecka2117ce52023-05-12 11:28:34 +020090{
Tomáš Pecka2117ce52023-05-12 11:28:34 +020091 m_pollThread = std::thread([&]() {
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020092 auto conn = m_session.getConnection();
93
94 DataTree prevValues;
Tomáš Peckac0991ce2023-12-20 15:46:03 +010095 std::set<std::string> seenSensors;
Tomáš Pecka1b3c1732023-05-12 11:45:01 +020096 std::map<std::string, State> thresholdsStates;
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +020097
98 while (!m_quit) {
99 m_log->trace("IetfHardware poll");
100
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100101 auto [hwStateValues, thresholds, activeSensors] = m_hwState->process();
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200102 std::set<std::string> deletedComponents;
103
Tomáš Peckac0991ce2023-12-20 15:46:03 +0100104 for (const auto& sensorXPath : activeSensors) {
105 if (!seenSensors.contains(sensorXPath)) {
106 auto componentXPath = extractComponentPrefix(sensorXPath);
107 utils::addResourceToAlarmInventoryEntry(m_session, ALARM_THRESHOLD_CROSSING_LOW, std::nullopt, componentXPath);
108 utils::addResourceToAlarmInventoryEntry(m_session, ALARM_THRESHOLD_CROSSING_HIGH, std::nullopt, componentXPath);
109 utils::addResourceToAlarmInventoryEntry(m_session, ALARM_SENSOR_MISSING, std::nullopt, componentXPath);
110 utils::addResourceToAlarmInventoryEntry(m_session, ALARM_SENSOR_NONOPERATIONAL, std::nullopt, componentXPath);
111 }
112 }
113 seenSensors.merge(activeSensors);
114
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200115 /* Some data readers can stop returning data in some cases (e.g. ejected PSU).
116 * Prune tree components that were removed before updating to avoid having not current data from previous invocations.
117 */
118 for (const auto& [k, v] : prevValues) {
119 if (!hwStateValues.contains(k)) {
120 deletedComponents.emplace(extractComponentPrefix(k));
121 }
122 }
123
Jan Kundrát498c3f82023-05-24 19:25:48 +0200124 std::vector<std::string> discards;
125 discards.reserve(deletedComponents.size());
126 std::copy(deletedComponents.begin(), deletedComponents.end(), std::back_inserter(discards));
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200127
Jan Kundrát498c3f82023-05-24 19:25:48 +0200128 utils::valuesPush(hwStateValues, {}, discards, m_session, ::sysrepo::Datastore::Operational);
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200129
Tomáš Pecka5a4c0352023-12-12 12:29:28 +0100130 /* Look for nonoperational sensors to set alarms */
131 for (const auto& [leaf, value] : hwStateValues) {
132 if (boost::ends_with(leaf, "/sensor-data/oper-status")) {
133 std::optional<std::string> oldValue;
134
135 if (auto it = prevValues.find(leaf); it != prevValues.end()) {
136 oldValue = it->second;
137 }
138
139 if (value == "nonoperational" && oldValue != "nonoperational") {
140 utils::createOrUpdateAlarm(m_session, ALARM_SENSOR_NONOPERATIONAL, std::nullopt, extractComponentPrefix(leaf), ALARM_SENSOR_NONOPERATIONAL_SEVERITY, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
141 } else if (value == "ok" && oldValue && oldValue != "ok" /* don't call clear-alarm if we see this node for the first time, i.e., oldvalue is nullopt */) {
142 utils::createOrUpdateAlarm(m_session, ALARM_SENSOR_NONOPERATIONAL, std::nullopt, extractComponentPrefix(leaf), ALARM_CLEARED, ALARM_SENSOR_NONOPERATIONAL_DESCRIPTION);
143 }
144 }
145 }
146
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200147 for (const auto& [sensorXPath, state] : thresholds) {
148 // missing prevState can be considered as Normal
149 const State prevState = [&, sensorXPath = sensorXPath] {
150 if (auto it = thresholdsStates.find(sensorXPath); it != thresholdsStates.end()) {
151 return it->second;
152 }
153 return State::Normal;
154 }();
155 const auto componentXPath = extractComponentPrefix(sensorXPath);
156
157 if (state == State::NoValue) {
158 logAlarm(m_log, componentXPath, ALARM_MISSING, ALARM_MISSING_SEVERITY);
159 utils::createOrUpdateAlarm(m_session, ALARM_MISSING, std::nullopt, componentXPath, ALARM_MISSING_SEVERITY, ALARM_MISSING_DESCRIPTION);
160 } else if (prevState == State::NoValue) {
161 logAlarm(m_log, componentXPath, ALARM_MISSING, ALARM_CLEARED);
162 /* The alarm message is same for both setting and clearing the alarm. RFC8632 says that it is
163 * "The string used to inform operators about the alarm. This MUST contain enough information for an operator to be able to understand the problem and how to resolve it.",
164 * i.e., from my POV it does not make sense to say something like "cleared" when clearing the alarm as this would not be beneficial for the operator to understand what happened.
165 */
166 utils::createOrUpdateAlarm(m_session, ALARM_MISSING, std::nullopt, componentXPath, ALARM_CLEARED, ALARM_MISSING_DESCRIPTION);
167 }
168
Tomáš Pecka9af47392023-05-23 14:56:48 +0200169 /*
170 * We set new threshold alarms first. In case the sensor value transitions from high to low (or low to high) we don't want to lose any active alarm on the resource.
171 *
172 * In case new state corresponds to threshold crossing (wither lower bound or upper bound) we set the alarm.
173 * Since we receive only changes to states it should be sufficient to just check if the new state crossed the threshold.
174 * We shouldn't receive any "no-op" state change (e.g. warning low to warning low) and even if we did receive such change, we would only set the same alarm again.
175 * We can however receive a change from critical threshold to warning threshold (or warning to critical) but that is no problem.
176 * We only need to set the same alarm again with the new severity.
177 */
178 if (isThresholdCrossingLow(state)) {
179 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, toYangAlarmSeverity(state));
180 utils::createOrUpdateAlarm(m_session, ALARM_THRESHOLD_CROSSING_LOW, std::nullopt, componentXPath, toYangAlarmSeverity(state), ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION);
181 } else if (isThresholdCrossingHigh(state)) {
182 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, toYangAlarmSeverity(state));
183 utils::createOrUpdateAlarm(m_session, ALARM_THRESHOLD_CROSSING_HIGH, std::nullopt, componentXPath, toYangAlarmSeverity(state), ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION);
184 }
185
186 /* Now we can clear the old threshold alarms that are no longer active, i.e., we transition away from the CriticalLow/WarningLow or CriticalHigh/WarningHigh. */
187 if (!isThresholdCrossingLow(state) && isThresholdCrossingLow(prevState)) {
188 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_LOW, ALARM_CLEARED);
189 utils::createOrUpdateAlarm(m_session, ALARM_THRESHOLD_CROSSING_LOW, std::nullopt, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_CROSSING_LOW_DESCRIPTION);
190 } else if (!isThresholdCrossingHigh(state) && isThresholdCrossingHigh(prevState)) {
191 logAlarm(m_log, componentXPath, ALARM_THRESHOLD_CROSSING_HIGH, ALARM_CLEARED);
192 utils::createOrUpdateAlarm(m_session, ALARM_THRESHOLD_CROSSING_HIGH, std::nullopt, componentXPath, ALARM_CLEARED, ALARM_THRESHOLD_CROSSING_HIGH_DESCRIPTION);
193 }
194
Tomáš Pecka1b3c1732023-05-12 11:45:01 +0200195 thresholdsStates[sensorXPath] = state;
196 }
197
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200198 prevValues = std::move(hwStateValues);
199 std::this_thread::sleep_for(m_pollInterval);
200 }
Tomáš Pecka2117ce52023-05-12 11:28:34 +0200201 });
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200202}
Václav Kubernát7efd6d52021-11-09 01:31:11 +0100203
Tomáš Pecka43ef7ba2023-04-13 15:56:48 +0200204Sysrepo::~Sysrepo()
205{
206 m_log->trace("Requesting poll thread stop");
207 m_quit = true;
208 m_pollThread.join();
Tomáš Pecka98ad18d2020-11-13 15:39:55 +0100209}
210}