Improve safety around canceling node requests

It's possible for a node request to disappear while we are trying
to fulfill it.  That will cause an unhandled exception and the
fake nodepool will stop.

Log all exceptions and continue running.

Also add an exception handler for the specific case encountered.

Also, add a canceled flag to node requests to aid in some race
condiations.  The _updateNodeRequest method can be called
out-of-band by ZK.  It can race with the scheduler calling
cancelRequest, which is significant for deleting the request
from Nodepool.requests as well as handing off a completed request
to the scheduler.

The canceled flag on requests aids in that by allowing us to only
delete from Nodepool.requests within the _updateNodeRequest method.

It also helps us detect when a request was canceled after it was
fulfilled and handed off to the scheduler.  In those cases, the
cancel method will have already deleted the request, but the scheduler
would then procces it and perhaps try to delete it again.  We check
the canceled flag inside of the scheduler main loop.  Since the
scheduler main loop is also the only thread which can call the cancel
method, we can be assured that it won't change while we evaluate it.
If we find that we ended up with a completed request which was canceled,
we just ignore it since we know that the cancel method would have
deleted the request.

Change-Id: I30f854a869d7690ab50340ba4c02067c4ae9fe2b
diff --git a/tests/base.py b/tests/base.py
index 2568ec8..0450463 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -1364,7 +1364,10 @@
 
     def run(self):
         while self._running:
-            self._run()
+            try:
+                self._run()
+            except Exception:
+                self.log.exception("Error in fake nodepool:")
             time.sleep(0.1)
 
     def _run(self):
@@ -1462,7 +1465,10 @@
         path = self.REQUEST_ROOT + '/' + oid
         data = json.dumps(request)
         self.log.debug("Fulfilling node request: %s %s" % (oid, data))
-        self.client.set(path, data)
+        try:
+            self.client.set(path, data)
+        except kazoo.exceptions.NoNodeError:
+            self.log.debug("Node request %s %s disappeared" % (oid, data))
 
 
 class ChrootedKazooFixture(fixtures.Fixture):
diff --git a/zuul/model.py b/zuul/model.py
index 8002f16..b14ce12 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -490,9 +490,10 @@
         self.stat = None
         self.uid = uuid4().hex
         self.id = None
-        # Zuul internal failure flag (not stored in ZK so it's not
+        # Zuul internal flags (not stored in ZK so they are not
         # overwritten).
         self.failed = False
+        self.canceled = False
 
     @property
     def fulfilled(self):
diff --git a/zuul/nodepool.py b/zuul/nodepool.py
index e94b950..8f6489c 100644
--- a/zuul/nodepool.py
+++ b/zuul/nodepool.py
@@ -38,11 +38,11 @@
     def cancelRequest(self, request):
         self.log.info("Canceling node request %s" % (request,))
         if request.uid in self.requests:
+            request.canceled = True
             try:
                 self.sched.zk.deleteNodeRequest(request)
             except Exception:
                 self.log.exception("Error deleting node request:")
-            del self.requests[request.uid]
 
     def useNodeSet(self, nodeset):
         self.log.info("Setting nodeset %s in use" % (nodeset,))
@@ -98,6 +98,10 @@
         if request.uid not in self.requests:
             return False
 
+        if request.canceled:
+            del self.requests[request.uid]
+            return False
+
         if request.state in (model.STATE_FULFILLED, model.STATE_FAILED):
             self.log.info("Node request %s %s" % (request, request.state))
 
@@ -119,6 +123,11 @@
 
         self.log.info("Accepting node request %s" % (request,))
 
+        if request.canceled:
+            self.log.info("Ignoring canceled node request %s" % (request,))
+            # The request was already deleted when it was canceled
+            return
+
         locked = False
         if request.fulfilled:
             # If the request suceeded, try to lock the nodes.
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 2e9bef2..a67973e 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -871,6 +871,8 @@
         build_set = request.build_set
 
         self.nodepool.acceptNodes(request)
+        if request.canceled:
+            return
 
         if build_set is not build_set.item.current_build_set:
             self.log.warning("Build set %s is not current" % (build_set,))