Handle nodepool allocation failure
When a request is either fulfilled or failed, pass it through to
the scheduler, which will accept the request (which, in the case of
a failure, means deleting it) and pass it on to the pipeline
manager, which will set the result of the requesting job to
NODE_FAILURE and cause any sub-jobs to be SKIPPED.
Adjust the request algorithm to only request nodes for jobs that
are ready to run. The current behavior requests nodes for all jobs
in a build set as soon as possible, but that has two downsides: it may request and
return nodes more aggressively than necessary (if you have chosen
to create a job tree, you *probably* don't want to tie up nodes
until they are actually needed). However, that's a grey area,
and we may want to adjust or make that behavior configurable later.
More pressing here is that it makes the logic of when to return
nodes *very* complicated (since SKIPPED jobs are represented by
fake builds, there is no good opportunity to return their nodes).
This seems like a good solution for now, and if we want to make
the node request behavior more aggressive in the future, we can
work out a better model for knowing when to return nodes.
Change-Id: Ideab6eb5794a01d5c2b70cb87d02d61bb3d41cce
diff --git a/tests/base.py b/tests/base.py
index 56c83f2..9e3c07b 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -887,6 +887,7 @@
self.thread = threading.Thread(target=self.run)
self.thread.daemon = True
self.thread.start()
+ self.fail_requests = set()
def stop(self):
self._running = False
@@ -965,21 +966,27 @@
nodeid = path.split("/")[-1]
return nodeid
+ def addFailRequest(self, request):
+ self.fail_requests.add(request['_oid'])
+
def fulfillRequest(self, request):
- if request['state'] == 'fulfilled':
+ if request['state'] != 'requested':
return
request = request.copy()
oid = request['_oid']
del request['_oid']
- nodes = []
- for node in request['node_types']:
- nodeid = self.makeNode(oid, node)
- nodes.append(nodeid)
+ if oid in self.fail_requests:
+ request['state'] = 'failed'
+ else:
+ request['state'] = 'fulfilled'
+ nodes = []
+ for node in request['node_types']:
+ nodeid = self.makeNode(oid, node)
+ nodes.append(nodeid)
+ request['nodes'] = nodes
- request['state'] = 'fulfilled'
request['state_time'] = time.time()
- request['nodes'] = nodes
path = self.REQUEST_ROOT + '/' + oid
data = json.dumps(request)
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 81635e0..f65dbce 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4549,6 +4549,27 @@
self.assertEqual(A.data['status'], 'MERGED')
self.assertEqual(A.reported, 2)
+ def test_nodepool_failure(self):
+ "Test that jobs are reported after a nodepool failure"
+
+ self.fake_nodepool.paused = True
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('code-review', 2)
+ self.fake_gerrit.addEvent(A.addApproval('approved', 1))
+ self.waitUntilSettled()
+
+ req = self.fake_nodepool.getNodeRequests()[0]
+ self.fake_nodepool.addFailRequest(req)
+
+ self.fake_nodepool.paused = False
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(A.reported, 2)
+ self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
+ self.assertIn('project-test1 : SKIPPED', A.messages[1])
+ self.assertIn('project-test2 : SKIPPED', A.messages[1])
+
class TestDuplicatePipeline(ZuulTestCase):
tenant_config_file = 'config/duplicate-pipeline/main.yaml'