Add Executor Merger and Ansible execution statsd counters
This adds the following counters:
- zuul.executor.*.phase.setup.<result> (setup task)
- zuul.executor.*.phase.cleanup.<result> (reset connection task)
- zuul.executor.*.phase.<phase>.<result> (pre/run/post playbooks)
- zuul.executor.*.merger.['SUCCESS','FAILURE'] (merger status)
The data provided by these counters are not very reliable, in the sense
that the failures may not be related to the executor itself and may
instead reflect a legitimate issue with the patch or the job being run.
However, when averaged out, these counters should help us identify if
a particular executor is exhibiting irregular behavior when compared
to regular patterns or other executors.
Change-Id: Ie430f9935dce94f4b90cffee33695e1eb4d1ca7d
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index d43fd03..1c17c28 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -131,6 +131,14 @@
component of the key will be replaced with the hostname of the
executor.
+ .. stat:: merger.<result>
+ :type: counter
+
+ Incremented to represent the status of a Zuul executor's merger
+ operations. ``<result>`` can be either ``SUCCESS`` or ``FAILURE``.
+ A merge operation counted here as a ``FAILURE`` is one that Zuul
+ ends up reporting as a ``MERGER_FAILURE``.
+
.. stat:: builds
:type: counter
@@ -148,6 +156,27 @@
The number of builds currently running on this executor. This
includes starting builds.
+ .. stat:: phase
+
+ Subtree detailing per-phase execution statistics:
+
+ .. stat:: <phase>
+
+ ``<phase>`` represents a phase in the execution of a job.
+ This can be an *internal* phase (such as ``setup`` or ``cleanup``) as
+ well as *job* phases such as ``pre``, ``run`` or ``post``.
+
+ .. stat:: <result>
+ :type: counter
+
+ A counter for each type of result.
+ These results do not, by themselves, determine the status of a build
+ but are indicators of the exit status provided by Ansible for the
+ execution of a particular phase.
+
+ Examples of possible counters for each phase are: ``RESULT_NORMAL``,
+ ``RESULT_TIMED_OUT``, ``RESULT_UNREACHABLE``, ``RESULT_ABORTED``.
+
.. stat:: load_average
:type: gauge
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index a2a9b42..a831a53 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -780,8 +780,17 @@
ret = merger.mergeChanges(items, repo_state=repo_state)
if not ret: # merge conflict
result = dict(result='MERGER_FAILURE')
+ if self.executor_server.statsd:
+ base_key = ("zuul.executor.%s.merger" %
+ self.executor_server.hostname)
+ self.executor_server.statsd.incr(base_key + ".FAILURE")
self.job.sendWorkComplete(json.dumps(result))
return False
+
+ if self.executor_server.statsd:
+ base_key = ("zuul.executor.%s.merger" %
+ self.executor_server.hostname)
+ self.executor_server.statsd.incr(base_key + ".SUCCESS")
recent = ret[3]
for key, commit in recent.items():
(connection, project, branch) = key
@@ -1465,6 +1474,11 @@
wrapped=False)
self.log.debug("Ansible complete, result %s code %s" % (
self.RESULT_MAP[result], code))
+ if self.executor_server.statsd:
+ base_key = ("zuul.executor.%s.phase.setup" %
+ self.executor_server.hostname)
+ self.executor_server.statsd.incr(base_key + ".%s" %
+ self.RESULT_MAP[result])
return result, code
def runAnsibleCleanup(self, playbook):
@@ -1485,6 +1499,11 @@
wrapped=False)
self.log.debug("Ansible complete, result %s code %s" % (
self.RESULT_MAP[result], code))
+ if self.executor_server.statsd:
+ base_key = ("zuul.executor.%s.phase.cleanup" %
+ self.executor_server.hostname)
+ self.executor_server.statsd.incr(base_key + ".%s" %
+ self.RESULT_MAP[result])
return result, code
def emitPlaybookBanner(self, playbook, step, phase, result=None):
@@ -1554,6 +1573,11 @@
cmd=cmd, timeout=timeout, playbook=playbook)
self.log.debug("Ansible complete, result %s code %s" % (
self.RESULT_MAP[result], code))
+ if self.executor_server.statsd:
+ base_key = ("zuul.executor.%s.phase.%s" %
+ (self.executor_server.hostname, phase or 'unknown'))
+ self.executor_server.statsd.incr(base_key + ".%s" %
+ self.RESULT_MAP[result])
self.emitPlaybookBanner(playbook, 'END', phase, result=result)
return result, code