Merge "Handle nodepool allocation failure" into feature/zuulv3
diff --git a/tests/base.py b/tests/base.py
index 56c83f2..9e3c07b 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -887,6 +887,7 @@
self.thread = threading.Thread(target=self.run)
self.thread.daemon = True
self.thread.start()
+ self.fail_requests = set()
def stop(self):
self._running = False
@@ -965,21 +966,27 @@
nodeid = path.split("/")[-1]
return nodeid
+ def addFailRequest(self, request):
+ self.fail_requests.add(request['_oid'])
+
def fulfillRequest(self, request):
- if request['state'] == 'fulfilled':
+ if request['state'] != 'requested':
return
request = request.copy()
oid = request['_oid']
del request['_oid']
- nodes = []
- for node in request['node_types']:
- nodeid = self.makeNode(oid, node)
- nodes.append(nodeid)
+ if oid in self.fail_requests:
+ request['state'] = 'failed'
+ else:
+ request['state'] = 'fulfilled'
+ nodes = []
+ for node in request['node_types']:
+ nodeid = self.makeNode(oid, node)
+ nodes.append(nodeid)
+ request['nodes'] = nodes
- request['state'] = 'fulfilled'
request['state_time'] = time.time()
- request['nodes'] = nodes
path = self.REQUEST_ROOT + '/' + oid
data = json.dumps(request)
self.log.debug("Fulfilling node request: %s %s" % (oid, data))
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index bee703f..579f7a3 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4546,6 +4546,27 @@
self.assertEqual(A.data['status'], 'MERGED')
self.assertEqual(A.reported, 2)
+ def test_nodepool_failure(self):
+ "Test that jobs are reported after a nodepool failure"
+
+ self.fake_nodepool.paused = True
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('code-review', 2)
+ self.fake_gerrit.addEvent(A.addApproval('approved', 1))
+ self.waitUntilSettled()
+
+ req = self.fake_nodepool.getNodeRequests()[0]
+ self.fake_nodepool.addFailRequest(req)
+
+ self.fake_nodepool.paused = False
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(A.reported, 2)
+ self.assertIn('project-merge : NODE_FAILURE', A.messages[1])
+ self.assertIn('project-test1 : SKIPPED', A.messages[1])
+ self.assertIn('project-test2 : SKIPPED', A.messages[1])
+
class TestDuplicatePipeline(ZuulTestCase):
tenant_config_file = 'config/duplicate-pipeline/main.yaml'
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 0d11316..988a930 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -648,6 +648,10 @@
build_set = request.build_set
build_set.jobNodeRequestComplete(request.job.name, request,
request.nodeset)
+ if request.failed or not request.fulfilled:
+ self.log.info("Node request failure for %s" %
+ (request.job.name,))
+ build_set.item.setNodeRequestFailure(request.job)
self.log.info("Completed node request %s for job %s of item %s "
"with nodes %s" %
(request, request.job, build_set.item,
diff --git a/zuul/model.py b/zuul/model.py
index d28c7cc..2c3c7b3 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -474,6 +474,10 @@
self.failed = False
@property
+ def fulfilled(self):
+ return (self._state == STATE_FULFILLED) and not self.failed
+
+ @property
def state(self):
return self._state
@@ -989,18 +993,28 @@
return self._findJobsToRun(tree.job_trees, mutex)
def _findJobsToRequest(self, job_trees):
+ build_set = self.current_build_set
toreq = []
+ if self.item_ahead:
+ if self.item_ahead.isHoldingFollowingChanges():
+ return []
for tree in job_trees:
job = tree.job
+ result = None
if job:
if not job.changeMatches(self.change):
continue
- nodeset = self.current_build_set.getJobNodeSet(job.name)
- if nodeset is None:
- req = self.current_build_set.getJobNodeRequest(job.name)
- if req is None:
- toreq.append(job)
- toreq.extend(self._findJobsToRequest(tree.job_trees))
+ build = build_set.getBuild(job.name)
+ if build:
+ result = build.result
+ else:
+ nodeset = build_set.getJobNodeSet(job.name)
+ if nodeset is None:
+ req = build_set.getJobNodeRequest(job.name)
+ if req is None:
+ toreq.append(job)
+ if result == 'SUCCESS' or not job:
+ toreq.extend(self._findJobsToRequest(tree.job_trees))
return toreq
def findJobsToRequest(self):
@@ -1022,6 +1036,12 @@
fakebuild.result = 'SKIPPED'
self.addBuild(fakebuild)
+ def setNodeRequestFailure(self, job):
+ fakebuild = Build(job, None)
+ self.addBuild(fakebuild)
+ fakebuild.result = 'NODE_FAILURE'
+ self.setResult(fakebuild)
+
def setDequeuedNeedingChange(self):
self.dequeued_needing_change = True
self._setAllJobsSkipped()
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index b7be94f..8c944cc 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -98,8 +98,8 @@
if request.uid not in self.requests:
return False
- if request.state == model.STATE_FULFILLED:
- self.log.info("Node request %s fulfilled" % (request,))
+ if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
+ self.log.info("Node request %s %s" % (request, request.state))
# Give our results to the scheduler.
self.sched.onNodesProvisioned(request)
@@ -119,17 +119,18 @@
self.log.info("Accepting node request %s" % (request,))
- # First, try to lock the nodes.
locked = False
- try:
- self.lockNodeset(request.nodeset)
- locked = True
- except Exception:
- self.log.exception("Error locking nodes:")
- request.failed = True
+ if request.fulfilled:
+ # If the request suceeded, try to lock the nodes.
+ try:
+ self.lockNodeset(request.nodeset)
+ locked = True
+ except Exception:
+ self.log.exception("Error locking nodes:")
+ request.failed = True
- # Regardless of whether locking succeeded, delete the
- # request.
+ # Regardless of whether locking (or even the request)
+ # succeeded, delete the request.
self.log.debug("Deleting node request %s" % (request,))
try:
self.sched.zk.deleteNodeRequest(request)
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 5f51cbf..5e49f20 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -811,22 +811,19 @@
request = event.request
build_set = request.build_set
- try:
- self.nodepool.acceptNodes(request)
- except Exception:
- self.log.exception("Unable to accept nodes from request %s:"
- % (request,))
- return
+ self.nodepool.acceptNodes(request)
if build_set is not build_set.item.current_build_set:
self.log.warning("Build set %s is not current" % (build_set,))
- self.nodepool.returnNodeset(request.nodeset)
+ if request.fulfilled:
+ self.nodepool.returnNodeset(request.nodeset)
return
pipeline = build_set.item.pipeline
if not pipeline:
self.log.warning("Build set %s is not associated with a pipeline" %
(build_set,))
- self.nodepool.returnNodeset(request.nodeset)
+ if request.fulfilled:
+ self.nodepool.returnNodeset(request.nodeset)
return
pipeline.manager.onNodesProvisioned(event)