Merge "Handle nodepool allocation failure" into feature/zuulv3
diff --git a/tests/base.py b/tests/base.py
index 56c83f2..9e3c07b 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -887,6 +887,7 @@
         self.thread = threading.Thread(target=self.run)
         self.thread.daemon = True
         self.thread.start()
+        self.fail_requests = set()
 
     def stop(self):
         self._running = False
@@ -965,21 +966,27 @@
         nodeid = path.split("/")[-1]
         return nodeid
 
+    def addFailRequest(self, request):
+        self.fail_requests.add(request['_oid'])
+
     def fulfillRequest(self, request):
-        if request['state'] == 'fulfilled':
+        if request['state'] != 'requested':
             return
         request = request.copy()
         oid = request['_oid']
         del request['_oid']
 
-        nodes = []
-        for node in request['node_types']:
-            nodeid = self.makeNode(oid, node)
-            nodes.append(nodeid)
+        if oid in self.fail_requests:
+            request['state'] = 'failed'
+        else:
+            request['state'] = 'fulfilled'
+            nodes = []
+            for node in request['node_types']:
+                nodeid = self.makeNode(oid, node)
+                nodes.append(nodeid)
+            request['nodes'] = nodes
 
-        request['state'] = 'fulfilled'
         request['state_time'] = time.time()
-        request['nodes'] = nodes
         path = self.REQUEST_ROOT + '/' + oid
         data = json.dumps(request)
         self.log.debug("Fulfilling node request: %s %s" % (oid, data))
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index bee703f..579f7a3 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4546,6 +4546,27 @@
         self.assertEqual(A.data['status'], 'MERGED')
         self.assertEqual(A.reported, 2)
 
+    def test_nodepool_failure(self):
+        "Test that jobs are reported after a nodepool failure"
+
+        self.fake_nodepool.paused = True
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('code-review', 2)
+        self.fake_gerrit.addEvent(A.addApproval('approved', 1))
+        self.waitUntilSettled()
+
+        req = self.fake_nodepool.getNodeRequests()[0]
+        self.fake_nodepool.addFailRequest(req)
+
+        self.fake_nodepool.paused = False
+        self.waitUntilSettled()
+
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(A.reported, 2)
+        self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
+        self.assertIn('project-test1 : SKIPPED', A.messages[1])
+        self.assertIn('project-test2 : SKIPPED', A.messages[1])
+
 
 class TestDuplicatePipeline(ZuulTestCase):
     tenant_config_file = 'config/duplicate-pipeline/main.yaml'
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 0d11316..988a930 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -648,6 +648,10 @@
         build_set = request.build_set
         build_set.jobNodeRequestComplete(request.job.name, request,
                                          request.nodeset)
+        if request.failed or not request.fulfilled:
+            self.log.info("Node request failure for %s" %
+                          (request.job.name,))
+            build_set.item.setNodeRequestFailure(request.job)
         self.log.info("Completed node request %s for job %s of item %s "
                       "with nodes %s" %
                       (request, request.job, build_set.item,
diff --git a/zuul/model.py b/zuul/model.py
index d28c7cc..2c3c7b3 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -474,6 +474,10 @@
         self.failed = False
 
     @property
+    def fulfilled(self):
+        return (self._state == STATE_FULFILLED) and not self.failed
+
+    @property
     def state(self):
         return self._state
 
@@ -989,18 +993,28 @@
         return self._findJobsToRun(tree.job_trees, mutex)
 
     def _findJobsToRequest(self, job_trees):
+        build_set = self.current_build_set
         toreq = []
+        if self.item_ahead:
+            if self.item_ahead.isHoldingFollowingChanges():
+                return []
         for tree in job_trees:
             job = tree.job
+            result = None
             if job:
                 if not job.changeMatches(self.change):
                     continue
-                nodeset = self.current_build_set.getJobNodeSet(job.name)
-                if nodeset is None:
-                    req = self.current_build_set.getJobNodeRequest(job.name)
-                    if req is None:
-                        toreq.append(job)
-            toreq.extend(self._findJobsToRequest(tree.job_trees))
+                build = build_set.getBuild(job.name)
+                if build:
+                    result = build.result
+                else:
+                    nodeset = build_set.getJobNodeSet(job.name)
+                    if nodeset is None:
+                        req = build_set.getJobNodeRequest(job.name)
+                        if req is None:
+                            toreq.append(job)
+            if result == 'SUCCESS' or not job:
+                toreq.extend(self._findJobsToRequest(tree.job_trees))
         return toreq
 
     def findJobsToRequest(self):
@@ -1022,6 +1036,12 @@
                 fakebuild.result = 'SKIPPED'
                 self.addBuild(fakebuild)
 
+    def setNodeRequestFailure(self, job):
+        fakebuild = Build(job, None)
+        self.addBuild(fakebuild)
+        fakebuild.result = 'NODE_FAILURE'
+        self.setResult(fakebuild)
+
     def setDequeuedNeedingChange(self):
         self.dequeued_needing_change = True
         self._setAllJobsSkipped()
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index b7be94f..8c944cc 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -98,8 +98,8 @@
         if request.uid not in self.requests:
             return False
 
-        if request.state == model.STATE_FULFILLED:
-            self.log.info("Node request %s fulfilled" % (request,))
+        if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
+            self.log.info("Node request %s %s" % (request, request.state))
 
             # Give our results to the scheduler.
             self.sched.onNodesProvisioned(request)
@@ -119,17 +119,18 @@
 
         self.log.info("Accepting node request %s" % (request,))
 
-        # First, try to lock the nodes.
         locked = False
-        try:
-            self.lockNodeset(request.nodeset)
-            locked = True
-        except Exception:
-            self.log.exception("Error locking nodes:")
-            request.failed = True
+        if request.fulfilled:
+            # If the request suceeded, try to lock the nodes.
+            try:
+                self.lockNodeset(request.nodeset)
+                locked = True
+            except Exception:
+                self.log.exception("Error locking nodes:")
+                request.failed = True
 
-        # Regardless of whether locking succeeded, delete the
-        # request.
+        # Regardless of whether locking (or even the request)
+        # succeeded, delete the request.
         self.log.debug("Deleting node request %s" % (request,))
         try:
             self.sched.zk.deleteNodeRequest(request)
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 5f51cbf..5e49f20 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -811,22 +811,19 @@
         request = event.request
         build_set = request.build_set
 
-        try:
-            self.nodepool.acceptNodes(request)
-        except Exception:
-            self.log.exception("Unable to accept nodes from request %s:"
-                               % (request,))
-            return
+        self.nodepool.acceptNodes(request)
 
         if build_set is not build_set.item.current_build_set:
             self.log.warning("Build set %s is not current" % (build_set,))
-            self.nodepool.returnNodeset(request.nodeset)
+            if request.fulfilled:
+                self.nodepool.returnNodeset(request.nodeset)
             return
         pipeline = build_set.item.pipeline
         if not pipeline:
             self.log.warning("Build set %s is not associated with a pipeline" %
                              (build_set,))
-            self.nodepool.returnNodeset(request.nodeset)
+            if request.fulfilled:
+                self.nodepool.returnNodeset(request.nodeset)
             return
         pipeline.manager.onNodesProvisioned(event)