Fix deadlock with nonexistent jobs.
When Zuul tried to launch a non-existent jenkins job, the path
to mark the job as LOST involves calling back into the scheduler
from the launcher while holding the queue lock. That deadlocks,
so instead, spawn a thread to report the job as lost.
Change-Id: I6a5661638e7edba1c56eb0b3d6384283b3ecc4ed
Reviewed-on: https://review.openstack.org/12889
Approved: James E. Blair <corvus@inaugust.com>
Reviewed-by: James E. Blair <corvus@inaugust.com>
Tested-by: Jenkins
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 568c85d..cacf4a9 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -460,6 +460,7 @@
self.hold_jobs_in_queue = False
self.hold_jobs_in_build = False
self.fail_tests = {}
+ self.nonexistent_jobs = []
def fakeEnqueue(self, job):
self.queue.append(job)
@@ -505,6 +506,8 @@
return False
def build_job(self, name, parameters):
+ if name in self.nonexistent_jobs:
+ raise Exception("Job does not exist")
count = self.job_counter.get(name, 0)
count += 1
self.job_counter[name] = count
@@ -1530,3 +1533,39 @@
assert A.data['status'] == 'MERGED'
assert A.reported == 2
self.assertEmptyQueues()
+
+ def test_nonexistent_job(self):
+ "Test launching a job that doesn't exist"
+ self.fake_jenkins.nonexistent_jobs.append('project-merge')
+ self.jenkins.launch_retry_timeout = 0.1
+
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('CRVW', 2)
+ self.fake_gerrit.addEvent(A.addApproval('APRV', 1))
+ # There may be a thread about to report a lost change
+ while A.reported < 2:
+ self.waitUntilSettled()
+ jobs = self.fake_jenkins.job_history
+ job_names = [x['name'] for x in jobs]
+ assert not job_names
+ assert A.data['status'] == 'NEW'
+ assert A.reported == 2
+ self.assertEmptyQueues()
+
+ # Make sure things still work:
+ self.fake_jenkins.nonexistent_jobs = []
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+ A.addApproval('CRVW', 2)
+ self.fake_gerrit.addEvent(A.addApproval('APRV', 1))
+ self.waitUntilSettled()
+ jobs = self.fake_jenkins.job_history
+ job_names = [x['name'] for x in jobs]
+ assert 'project-merge' in job_names
+ assert 'project-test1' in job_names
+ assert 'project-test2' in job_names
+ assert jobs[0]['result'] == 'SUCCESS'
+ assert jobs[1]['result'] == 'SUCCESS'
+ assert jobs[2]['result'] == 'SUCCESS'
+ assert A.data['status'] == 'MERGED'
+ assert A.reported == 2
+ self.assertEmptyQueues()
diff --git a/zuul/launcher/jenkins.py b/zuul/launcher/jenkins.py
index 19396ae..6fd4ca4 100644
--- a/zuul/launcher/jenkins.py
+++ b/zuul/launcher/jenkins.py
@@ -190,6 +190,7 @@
class Jenkins(object):
log = logging.getLogger("zuul.Jenkins")
+ launch_retry_timeout = 5
def __init__(self, config, sched):
self.sched = sched
@@ -267,7 +268,7 @@
self.log.exception("Exception launching build %s for "
"job %s for change %s (will retry):" %
(build, job, change))
- time.sleep(5)
+ time.sleep(self.launch_retry_timeout)
if errored:
if launched:
@@ -277,10 +278,16 @@
"declaring lost" % build)
# To keep the queue moving, declare this as a lost build
# so that the change will get dropped.
- self.onBuildCompleted(build.uuid, 'LOST', None, None)
-
+ t = threading.Thread(target=self.declareBuildLost,
+ args=(build,))
+ t.start()
return build
+ def declareBuildLost(self, build):
+ # Call this from a new thread to invoke onBuildCompleted from
+ # a thread that has the queue lock.
+ self.onBuildCompleted(build.uuid, 'LOST', None, None)
+
def findBuildInQueue(self, build):
for item in self.jenkins.get_queue_info():
if 'actions' not in item:
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 4b205f1..329d0c4 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -707,6 +707,7 @@
result = job.success_message
elif result == 'FAILURE' and job.failure_message:
result = job.failure_message
+ url = None
if build.url:
if pattern:
url = pattern.format(change=changeish,