Verify nodes and requests are not leaked
Check that at the end of every test, there are no outstanding
nodepool requests and no locked nodes.
Move final state assertions into the tearDown method so that
they run right after the end of the test but before any
cleanup handlers are called (which can interfere with the
assertion checking by, say, deleting the ZooKeeper tree we
are trying to check). Move the cleanup in test_webapp to
tearDown so that the held job used by the tests in that class
is released before the assertion check runs.
Fix some bugs uncovered by this testing:
* Two typos.
* When we re-launch a job, we need a new nodeset, so make sure
to remove the nodeset from the buildset after the build
completes if we are going to retry the build.
* Always report build results to the scheduler even for non-current
buildsets so that it can return used nodes for aborted builds.
* Have the scheduler, rather than the pipeline manager, return the
nodeset for a completed build, to avoid the edge case where a build
result is returned after a configuration change that removes the
pipeline (and therefore there is no longer a manager to return the
nodeset).
* When canceling jobs, return nodesets for any jobs which do not yet
have builds (such as jobs which have nodes but have not yet
launched).
* Return nodes for skipped jobs.
Normalize the log messages in nodepool.py.
Change-Id: I32f6807ac95034fc2636993824f4a45ffe7c59d8
diff --git a/tests/base.py b/tests/base.py
index f10157d..56c83f2 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -874,6 +874,7 @@
class FakeNodepool(object):
REQUEST_ROOT = '/nodepool/requests'
+ NODE_ROOT = '/nodepool/nodes'
log = logging.getLogger("zuul.test.FakeNodepool")
@@ -918,6 +919,28 @@
reqs.append(data)
return reqs
+ def getNodes(self):
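+ # Return the nodes the fake nodepool knows about, recording each
+ # node's ZooKeeper id and whether it is currently locked.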
+ try:
+ nodeids = self.client.get_children(self.NODE_ROOT)
+ except kazoo.exceptions.NoNodeError:
+ return []
+ nodes = []
+ for oid in sorted(nodeids):
+ path = self.NODE_ROOT + '/' + oid
+ data, stat = self.client.get(path)
+ data = json.loads(data)
+ data['_oid'] = oid
+ try:
+ lockfiles = self.client.get_children(path + '/lock')
+ except kazoo.exceptions.NoNodeError:
+ lockfiles = []
+ if lockfiles:
+ data['_lock'] = True
+ else:
+ data['_lock'] = False
+ nodes.append(data)
+ return nodes
+
def makeNode(self, request_id, node_type):
now = time.time()
path = '/nodepool/nodes/'
@@ -1257,9 +1280,12 @@
self.rpc.start()
self.launch_client.gearman.waitForServer()
- self.addCleanup(self.assertFinalState)
self.addCleanup(self.shutdown)
+ def tearDown(self):
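+ # tearDown runs after the test method but before any addCleanup
+ # handlers, so the final state is checked while the ZooKeeper tree
+ # is still intact.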
+ super(ZuulTestCase, self).tearDown()
+ self.assertFinalState()
+
def configure_connections(self):
# Register connections from the config
self.smtp_messages = []
@@ -1366,6 +1392,17 @@
self.addCommitToRepo(project, 'add content from fixture',
files, branch='master', tag='init')
+ def assertNodepoolState(self):
+ # Make sure that there are no pending node requests and no locked nodes
+
+ requests = self.fake_nodepool.getNodeRequests()
+ self.assertEqual(len(requests), 0)
+
+ nodes = self.fake_nodepool.getNodes()
+ for node in nodes:
+ self.assertFalse(node['_lock'], "Node %s is locked" %
+ (node['_oid'],))
+
def assertFinalState(self):
# Make sure that git.Repo objects have been garbage collected.
repos = []
@@ -1375,6 +1412,7 @@
repos.append(obj)
self.assertEqual(len(repos), 0)
self.assertEmptyQueues()
+ self.assertNodepoolState()
ipm = zuul.manager.independent.IndependentPipelineManager
for tenant in self.sched.abide.tenants.values():
for pipeline in tenant.layout.pipelines.values():
diff --git a/tests/test_webapp.py b/tests/test_webapp.py
index e191244..8268ef7 100644
--- a/tests/test_webapp.py
+++ b/tests/test_webapp.py
@@ -26,14 +26,8 @@
class TestWebapp(ZuulTestCase):
tenant_config_file = 'config/single-tenant/main.yaml'
- def _cleanup(self):
- self.launch_server.hold_jobs_in_build = False
- self.launch_server.release()
- self.waitUntilSettled()
-
def setUp(self):
super(TestWebapp, self).setUp()
- self.addCleanup(self._cleanup)
self.launch_server.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
A.addApproval('code-review', 2)
@@ -44,6 +38,12 @@
self.waitUntilSettled()
self.port = self.webapp.server.socket.getsockname()[1]
+ def tearDown(self):
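+ # Release the held job before the base class checks the final
+ # state, so that no node is left locked.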
+ self.launch_server.hold_jobs_in_build = False
+ self.launch_server.release()
+ self.waitUntilSettled()
+ super(TestWebapp, self).tearDown()
+
def test_webapp_status(self):
"Test that we can filter to only certain changes in the webapp."
diff --git a/zuul/launcher/client.py b/zuul/launcher/client.py
index e17c83c..9e895ef 100644
--- a/zuul/launcher/client.py
+++ b/zuul/launcher/client.py
@@ -471,14 +471,13 @@
data = getJobData(job)
build.node_labels = data.get('node_labels', [])
build.node_name = data.get('node_name')
- if not build.canceled:
- if result is None:
- result = data.get('result')
- if result is None:
- build.retry = True
- self.log.info("Build %s complete, result %s" %
- (job, result))
- self.sched.onBuildCompleted(build, result)
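+ # Report the result even for canceled builds so that the
+ # scheduler can return their nodes to nodepool.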
+ if result is None:
+ result = data.get('result')
+ if result is None:
+ build.retry = True
+ self.log.info("Build %s complete, result %s" %
+ (job, result))
+ self.sched.onBuildCompleted(build, result)
# The test suite expects the build to be removed from the
# internal dict after it's added to the report queue.
del self.builds[job.unique]
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 7f64986..0d11316 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -395,6 +395,7 @@
for req in old_build_set.node_requests.values():
self.sched.nodepool.cancelRequest(req)
old_build_set.node_requests = {}
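+ # Remember which jobs had a build so that, below, we can return
+ # nodesets held by jobs which never started one.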
+ canceled_jobs = set()
for build in old_build_set.getBuilds():
was_running = False
try:
@@ -405,13 +406,18 @@
if not was_running:
try:
nodeset = build.build_set.getJobNodeSet(build.job.name)
- self.nodepool.returnNodeset(nodeset)
+ self.sched.nodepool.returnNodeset(nodeset)
except Exception:
self.log.exception("Unable to return nodeset %s for "
"canceled build request %s" %
(nodeset, build))
build.result = 'CANCELED'
canceled = True
+ canceled_jobs.add(build.job.name)
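+ # Return nodesets for jobs which have nodes but no build (such as
+ # jobs which have not yet launched).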
+ for jobname, nodeset in list(old_build_set.nodesets.items()):
+ if jobname in canceled_jobs:
+ continue
+ self.sched.nodepool.returnNodeset(nodeset)
for item_behind in item.items_behind:
self.log.debug("Canceling jobs for change %s, behind change %s" %
(item_behind.change, item.change))
@@ -609,11 +615,15 @@
self.log.debug("Item %s status is now:\n %s" %
(item, item.formatStatus()))
- try:
- nodeset = build.build_set.getJobNodeSet(build.job.name)
- self.nodepool.returnNodeset(nodeset)
- except Exception:
- self.log.exception("Unable to return nodeset %s" % (nodeset,))
+ if build.retry:
+ build.build_set.removeJobNodeSet(build.job.name)
+
+ # If any jobs were skipped as a result of this build, return
+ # their nodes.
+ for build in build.build_set.getBuilds():
+ if build.result == 'SKIPPED':
+ nodeset = build.build_set.getJobNodeSet(build.job.name)
+ self.sched.nodepool.returnNodeset(nodeset)
return True
diff --git a/zuul/model.py b/zuul/model.py
index 2d88eb5..2805c85 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -762,6 +762,11 @@
# required
return self.nodesets.get(job_name)
+ def removeJobNodeSet(self, job_name):
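+ # Remove the nodeset for a job (e.g., when its build will be
+ # retried and a new nodeset must be requested).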
+ if job_name not in self.nodesets:
+ raise Exception("No job set for %s" % (job_name))
+ del self.nodesets[job_name]
+
def setJobNodeRequest(self, job_name, req):
if job_name in self.node_requests:
raise Exception("Prior node request for %s" % (job_name))
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index 4d0442f..dead411 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -28,14 +28,15 @@
nodeset = job.nodeset.copy()
req = NodeRequest(build_set, job, nodeset)
self.requests[req.uid] = req
- self.log.debug("Submitting node request: %s" % (req,))
self.sched.zk.submitNodeRequest(req, self._updateNodeRequest)
+ # Logged after submission so that we have the request id
+ self.log.info("Submited node request %s" % (req,))
return req
def cancelRequest(self, request):
- self.log.debug("Canceling node request: %s" % (request,))
+ self.log.info("Canceling node request %s" % (request,))
if request.uid in self.requests:
try:
self.sched.zk.deleteNodeRequest(request)
@@ -44,6 +45,7 @@
del self.requests[request.uid]
def useNodeset(self, nodeset):
+ self.log.info("Setting nodeset %s in use" % (nodeset,))
for node in nodeset.getNodes():
if node.lock is None:
raise Exception("Node %s is not locked" % (node,))
@@ -51,6 +53,7 @@
self.sched.zk.storeNode(node)
def returnNodeset(self, nodeset):
+ self.log.info("Returning nodeset %s" % (nodeset,))
for node in nodeset.getNodes():
if node.lock is None:
raise Exception("Node %s is not locked" % (node,))
@@ -79,7 +82,7 @@
locked_nodes = []
try:
for node in nodes:
- self.log.debug("Locking node: %s" % (node,))
+ self.log.debug("Locking node %s" % (node,))
self.sched.zk.lockNode(node)
locked_nodes.append(node)
except Exception:
@@ -90,7 +93,7 @@
def _updateNodeRequest(self, request, deleted):
# Return False to indicate that we should stop watching the
# node.
- self.log.debug("Updating node request: %s" % (request,))
+ self.log.debug("Updating node request %s" % (request,))
if request.uid not in self.requests:
return False
@@ -114,7 +117,7 @@
# Called by the scheduler when it wants to accept and lock
# nodes for (potential) use.
- self.log.debug("Accepting node request: %s" % (request,))
+ self.log.info("Accepting node request %s" % (request,))
# First, try to lock the nodes.
locked = False
@@ -127,7 +130,7 @@
# Regardless of whether locking succeeded, delete the
# request.
- self.log.debug("Deleting node request: %s" % (request,))
+ self.log.debug("Deleting node request %s" % (request,))
try:
self.sched.zk.deleteNodeRequest(request)
except Exception:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 4a0a9eb..5f51cbf 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -767,9 +767,19 @@
def _doBuildCompletedEvent(self, event):
build = event.build
+
+ # Regardless of any other conditions which might cause us not
+ # to pass this on to the pipeline manager, make sure we return
+ # the nodes to nodepool.
+ try:
+ nodeset = build.build_set.getJobNodeSet(build.job.name)
+ self.nodepool.returnNodeset(nodeset)
+ except Exception:
+ self.log.exception("Unable to return nodeset %s" % (nodeset,))
+
if build.build_set is not build.build_set.item.current_build_set:
- self.log.warning("Build %s is not in the current build set" %
- (build,))
+ self.log.debug("Build %s is not in the current build set" %
+ (build,))
return
pipeline = build.build_set.item.pipeline
if not pipeline: