Fix deadlock with nonexistent jobs.

When Zuul tries to launch a nonexistent Jenkins job, the path
to mark the job as LOST calls back into the scheduler from the
launcher while holding the queue lock.  That deadlocks, so
instead, spawn a thread to report the job as lost.

Change-Id: I6a5661638e7edba1c56eb0b3d6384283b3ecc4ed
Reviewed-on: https://review.openstack.org/12889
Approved: James E. Blair <corvus@inaugust.com>
Reviewed-by: James E. Blair <corvus@inaugust.com>
Tested-by: Jenkins
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 568c85d..cacf4a9 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -460,6 +460,7 @@
         self.hold_jobs_in_queue = False
         self.hold_jobs_in_build = False
         self.fail_tests = {}
+        self.nonexistent_jobs = []
 
     def fakeEnqueue(self, job):
         self.queue.append(job)
@@ -505,6 +506,8 @@
         return False
 
     def build_job(self, name, parameters):
+        if name in self.nonexistent_jobs:
+            raise Exception("Job does not exist")
         count = self.job_counter.get(name, 0)
         count += 1
         self.job_counter[name] = count
@@ -1530,3 +1533,39 @@
         assert A.data['status'] == 'MERGED'
         assert A.reported == 2
         self.assertEmptyQueues()
+
+    def test_nonexistent_job(self):
+        "Test launching a job that doesn't exist"
+        self.fake_jenkins.nonexistent_jobs.append('project-merge')
+        self.jenkins.launch_retry_timeout = 0.1
+
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('CRVW', 2)
+        self.fake_gerrit.addEvent(A.addApproval('APRV', 1))
+        # There may be a thread about to report a lost change
+        while A.reported < 2:
+            self.waitUntilSettled()
+        jobs = self.fake_jenkins.job_history
+        job_names = [x['name'] for x in jobs]
+        assert not job_names
+        assert A.data['status'] == 'NEW'
+        assert A.reported == 2
+        self.assertEmptyQueues()
+
+        # Make sure things still work:
+        self.fake_jenkins.nonexistent_jobs = []
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('CRVW', 2)
+        self.fake_gerrit.addEvent(A.addApproval('APRV', 1))
+        self.waitUntilSettled()
+        jobs = self.fake_jenkins.job_history
+        job_names = [x['name'] for x in jobs]
+        assert 'project-merge' in job_names
+        assert 'project-test1' in job_names
+        assert 'project-test2' in job_names
+        assert jobs[0]['result'] == 'SUCCESS'
+        assert jobs[1]['result'] == 'SUCCESS'
+        assert jobs[2]['result'] == 'SUCCESS'
+        assert A.data['status'] == 'MERGED'
+        assert A.reported == 2
+        self.assertEmptyQueues()
diff --git a/zuul/launcher/jenkins.py b/zuul/launcher/jenkins.py
index 19396ae..6fd4ca4 100644
--- a/zuul/launcher/jenkins.py
+++ b/zuul/launcher/jenkins.py
@@ -190,6 +190,7 @@
 
 class Jenkins(object):
     log = logging.getLogger("zuul.Jenkins")
+    launch_retry_timeout = 5
 
     def __init__(self, config, sched):
         self.sched = sched
@@ -267,7 +268,7 @@
                 self.log.exception("Exception launching build %s for "
                                    "job %s for change %s (will retry):" %
                                    (build, job, change))
-                time.sleep(5)
+                time.sleep(self.launch_retry_timeout)
 
         if errored:
             if launched:
@@ -277,10 +278,16 @@
                                "declaring lost" % build)
                 # To keep the queue moving, declare this as a lost build
                 # so that the change will get dropped.
-                self.onBuildCompleted(build.uuid, 'LOST', None, None)
-
+                t = threading.Thread(target=self.declareBuildLost,
+                                     args=(build,))
+                t.start()
         return build
 
+    def declareBuildLost(self, build):
+        # Call this from a new thread so that onBuildCompleted is not
+        # invoked from a thread that holds the queue lock (that deadlocks).
+        self.onBuildCompleted(build.uuid, 'LOST', None, None)
+
     def findBuildInQueue(self, build):
         for item in self.jenkins.get_queue_info():
             if 'actions' not in item:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 4b205f1..329d0c4 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -707,6 +707,7 @@
                     result = job.success_message
                 elif result == 'FAILURE' and job.failure_message:
                     result = job.failure_message
+                url = None
                 if build.url:
                     if pattern:
                         url = pattern.format(change=changeish,