Handle nodepool allocation failure

When a request is either fulfilled or failed, pass it through to
the scheduler which will accept the request (which means deleting
it in the case of a failure) and pass it on to the pipeline manager
which will set the result of the requesting job to NODE_FAILURE
and cause any sub-jobs to be SKIPPED.

Adjust the request algorithm to only request nodes for jobs that
are ready to run.  The current behavior requests nodes for all jobs
in a build set as soon as possible, but that has two downsides: it
may request and
return nodes more aggressively than necessary (if you have chosen
to create a job tree, you *probably* don't want to tie up nodes
until they are actually needed).  However, that's a grey area,
and we may want to adjust or make that behavior configurable later.
More pressing here is that it makes the logic of when to return
nodes *very* complicated (since SKIPPED jobs are represented by
fake builds, there is no good opportunity to return their nodes).

This seems like a good solution for now, and if we want to make
the node request behavior more aggressive in the future, we can
work out a better model for knowing when to return nodes.

Change-Id: Ideab6eb5794a01d5c2b70cb87d02d61bb3d41cce
diff --git a/tests/base.py b/tests/base.py
index 56c83f2..9e3c07b 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -887,6 +887,7 @@
         self.thread = threading.Thread(target=self.run)
         self.thread.daemon = True
         self.thread.start()
+        self.fail_requests = set()
 
     def stop(self):
         self._running = False
@@ -965,21 +966,27 @@
         nodeid = path.split("/")[-1]
         return nodeid
 
+    def addFailRequest(self, request):
+        self.fail_requests.add(request['_oid'])
+
     def fulfillRequest(self, request):
-        if request['state'] == 'fulfilled':
+        if request['state'] != 'requested':
             return
         request = request.copy()
         oid = request['_oid']
         del request['_oid']
 
-        nodes = []
-        for node in request['node_types']:
-            nodeid = self.makeNode(oid, node)
-            nodes.append(nodeid)
+        if oid in self.fail_requests:
+            request['state'] = 'failed'
+        else:
+            request['state'] = 'fulfilled'
+            nodes = []
+            for node in request['node_types']:
+                nodeid = self.makeNode(oid, node)
+                nodes.append(nodeid)
+            request['nodes'] = nodes
 
-        request['state'] = 'fulfilled'
         request['state_time'] = time.time()
-        request['nodes'] = nodes
         path = self.REQUEST_ROOT + '/' + oid
         data = json.dumps(request)
         self.log.debug("Fulfilling node request: %s %s" % (oid, data))
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 81635e0..f65dbce 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4549,6 +4549,27 @@
         self.assertEqual(A.data['status'], 'MERGED')
         self.assertEqual(A.reported, 2)
 
+    def test_nodepool_failure(self):
+        "Test that jobs are reported after a nodepool failure"
+
+        self.fake_nodepool.paused = True
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('code-review', 2)
+        self.fake_gerrit.addEvent(A.addApproval('approved', 1))
+        self.waitUntilSettled()
+
+        req = self.fake_nodepool.getNodeRequests()[0]
+        self.fake_nodepool.addFailRequest(req)
+
+        self.fake_nodepool.paused = False
+        self.waitUntilSettled()
+
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(A.reported, 2)
+        self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
+        self.assertIn('project-test1 : SKIPPED', A.messages[1])
+        self.assertIn('project-test2 : SKIPPED', A.messages[1])
+
 
 class TestDuplicatePipeline(ZuulTestCase):
     tenant_config_file = 'config/duplicate-pipeline/main.yaml'