Handle nodepool allocation failure
When a request is either fulfilled or failed, pass it through to
the scheduler, which will accept the request (which, in the case of
a failure, means deleting it) and pass it on to the pipeline
manager, which will set the result of the requesting job to
NODE_FAILURE and cause any sub-jobs to be SKIPPED.
Adjust the request algorithm to only request nodes for jobs that
are ready to run. The current behavior requests nodes for all jobs
in a build set as soon as possible, but that has two downsides: it may request and
return nodes more aggressively than necessary (if you have chosen
to create a job tree, you *probably* don't want to tie up nodes
until they are actually needed). However, that's a grey area,
and we may want to adjust or make that behavior configurable later.
More pressing here is that it makes the logic of when to return
nodes *very* complicated (since SKIPPED jobs are represented by
fake builds, there is no good opportunity to return their nodes).
This seems like a good solution for now, and if we want to make
the node request behavior more aggressive in the future, we can
work out a better model for knowing when to return nodes.
Change-Id: Ideab6eb5794a01d5c2b70cb87d02d61bb3d41cce
diff --git a/tests/base.py b/tests/base.py
index 56c83f2..9e3c07b 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -887,6 +887,7 @@
self.thread = threading.Thread(target=self.run)
self.thread.daemon = True
self.thread.start()
+ self.fail_requests = set()
def stop(self):
self._running = False
@@ -965,21 +966,27 @@
nodeid = path.split("/")[-1]
return nodeid
+ def addFailRequest(self, request):
+ self.fail_requests.add(request['_oid'])
+
def fulfillRequest(self, request):
- if request['state'] == 'fulfilled':
+ if request['state'] != 'requested':
return
request = request.copy()
oid = request['_oid']
del request['_oid']
- nodes = []
- for node in request['node_types']:
- nodeid = self.makeNode(oid, node)
- nodes.append(nodeid)
+ if oid in self.fail_requests:
+ request['state'] = 'failed'
+ else:
+ request['state'] = 'fulfilled'
+ nodes = []
+ for node in request['node_types']:
+ nodeid = self.makeNode(oid, node)
+ nodes.append(nodeid)
+ request['nodes'] = nodes
- request['state'] = 'fulfilled'
request['state_time'] = time.time()
- request['nodes'] = nodes
path = self.REQUEST_ROOT + '/' + oid
data = json.dumps(request)
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 81635e0..f65dbce 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4549,6 +4549,27 @@
self.assertEqual(A.data['status'], 'MERGED')
self.assertEqual(A.reported, 2)
+ def test_nodepool_failure(self):
+ "Test that jobs are reported after a nodepool failure"
+
+ self.fake_nodepool.paused = True
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('code-review', 2)
+ self.fake_gerrit.addEvent(A.addApproval('approved', 1))
+ self.waitUntilSettled()
+
+ req = self.fake_nodepool.getNodeRequests()[0]
+ self.fake_nodepool.addFailRequest(req)
+
+ self.fake_nodepool.paused = False
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(A.reported, 2)
+ self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
+ self.assertIn('project-test1 : SKIPPED', A.messages[1])
+ self.assertIn('project-test2 : SKIPPED', A.messages[1])
+
class TestDuplicatePipeline(ZuulTestCase):
tenant_config_file = 'config/duplicate-pipeline/main.yaml'