Merge "Emit some nodepool stats" into feature/zuulv3
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index dc6be0d..55f1908 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -146,6 +146,84 @@
The one-minute load average of this executor, multiplied by 100.
+.. stat:: zuul.nodepool
+
+ Holds metrics related to Zuul requests from Nodepool.
+
+ .. stat:: requested
+ :type: counter
+
+ Incremented each time a node request is submitted to Nodepool.
+
+ .. stat:: label.<label>
+ :type: counter
+
+ Incremented each time a request for a specific label is
+ submitted to Nodepool.
+
+ .. stat:: size.<size>
+ :type: counter
+
+ Incremented each time a request of a specific size is submitted
+ to Nodepool. For example, a request for 3 nodes would use the
+ key ``zuul.nodepool.requested.size.3``.
+
+ .. stat:: canceled
+ :type: counter, timer
+
+ The counter is incremented each time a node request is canceled
+ by Zuul. The timer records the elapsed time from request to
+ cancelation.
+
+ .. stat:: label.<label>
+ :type: counter, timer
+
+ The same, for a specific label.
+
+ .. stat:: size.<size>
+ :type: counter, timer
+
+ The same, for a specific request size.
+
+ .. stat:: fulfilled
+ :type: counter, timer
+
+ The counter is incremented each time a node request is fulfilled
+ by Nodepool. The timer records the elapsed time from request to
+ fulfillment.
+
+ .. stat:: label.<label>
+ :type: counter, timer
+
+ The same, for a specific label.
+
+ .. stat:: size.<size>
+ :type: counter, timer
+
+ The same, for a specific request size.
+
+ .. stat:: failed
+ :type: counter, timer
+
+ The counter is incremented each time Nodepool fails to fulfill a
+ node request. The timer records the elapsed time from request
+ to failure.
+
+ .. stat:: label.<label>
+ :type: counter, timer
+
+ The same, for a specific label.
+
+ .. stat:: size.<size>
+ :type: counter, timer
+
+ The same, for a specific request size.
+
+ .. stat:: current_requests
+ :type: gauge
+
+ The number of outstanding nodepool requests from Zuul.
+
As an example, given a job named `myjob` in `mytenant` triggered by a
change to `myproject` on the `master` branch in the `gate` pipeline
diff --git a/tests/unit/test_nodepool.py b/tests/unit/test_nodepool.py
index d51898b..d3f9ddb 100644
--- a/tests/unit/test_nodepool.py
+++ b/tests/unit/test_nodepool.py
@@ -29,6 +29,7 @@
def setUp(self):
super(TestNodepool, self).setUp()
+ self.statsd = None
self.zk_chroot_fixture = self.useFixture(
ChrootedKazooFixture(self.id()))
self.zk_config = '%s:%s%s' % (
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index ab0d0fd..bab5162 100755
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -119,6 +119,14 @@
'org_project.master.total_changes', value='1|c')
exec_key = 'zuul.executor.%s' % self.executor_server.hostname
self.assertReportedStat(exec_key + '.builds', value='1|c')
+ self.assertReportedStat('zuul.nodepool.requested', value='1|c')
+ self.assertReportedStat('zuul.nodepool.requested.label.label1',
+ value='1|c')
+ self.assertReportedStat('zuul.nodepool.fulfilled.label.label1',
+ value='1|c')
+ self.assertReportedStat('zuul.nodepool.requested.size.1', value='1|c')
+ self.assertReportedStat('zuul.nodepool.fulfilled.size.1', value='1|c')
+ self.assertReportedStat('zuul.nodepool.current_requests', value='1|g')
for build in self.history:
self.assertTrue(build.parameters['zuul']['voting'])
diff --git a/zuul/model.py b/zuul/model.py
index 6eebbfb..b7c3031 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -523,6 +523,7 @@
self.job = job
self.nodeset = nodeset
self._state = STATE_REQUESTED
+ self.requested_time = time.time()
self.state_time = time.time()
self.stat = None
self.uid = uuid4().hex
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index f4c850d..7dafca0 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -22,6 +22,35 @@
self.requests = {}
self.sched = scheduler
+ def emitStats(self, request):
+ if not self.sched.statsd:
+ return
+ statsd = self.sched.statsd
+ # counter zuul.nodepool.requested
+ # counter zuul.nodepool.requested.label.<label>
+ # counter zuul.nodepool.requested.size.<size>
+ # gauge zuul.nodepool.current_requests
+ state = request.state
+ if request.canceled:
+ state = 'canceled'
+ dt = None
+ elif request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
+ dt = int((request.state_time - request.requested_time) * 1000)
+ else:
+ dt = None
+ key = 'zuul.nodepool.%s' % state
+ statsd.incr(key)
+ if dt:
+ statsd.timing(key, dt)
+ for node in request.nodeset.getNodes():
+ statsd.incr(key + '.label.%s' % node.label)
+ if dt:
+ statsd.timing(key + '.label.%s' % node.label, dt)
+ statsd.incr(key + '.size.%s' % len(request.nodeset.nodes))
+ if dt:
+ statsd.timing(key + '.size.%s' % len(request.nodeset.nodes), dt)
+ statsd.gauge('zuul.nodepool.current_requests', len(self.requests))
+
def requestNodes(self, build_set, job):
# Create a copy of the nodeset to represent the actual nodes
# returned by nodepool.
@@ -33,6 +62,7 @@
self.sched.zk.submitNodeRequest(req, self._updateNodeRequest)
# Logged after submission so that we have the request id
self.log.info("Submited node request %s" % (req,))
+ self.emitStats(req)
else:
self.log.info("Fulfilling empty node request %s" % (req,))
req.state = model.STATE_FULFILLED
@@ -142,6 +172,7 @@
if request.canceled:
del self.requests[request.uid]
+ self.emitStats(request)
return False
# TODOv3(jeblair): handle allocation failure
@@ -156,6 +187,8 @@
self.sched.onNodesProvisioned(request)
del self.requests[request.uid]
+ self.emitStats(request)
+
# Stop watching this request node.
return False