Fix bug with removing a failed job
A recent change in I96195cb9996b01075e3705b6cd89a9863528898e added
code to re-set build states for changes during a re-enqueue to
ensure that any side effects (such as marking new builds as skipped)
occurred. However, we have found that if a change has a
non-succeeding build and the job for that build is removed, then
this new code will run and attempt to perform a lookup on the removed
job, fail, and raise an exception that aborts reconfiguration.
Correct this by fully removing builds from their build sets when
the corresponding job is removed so that no further processing
happens on them.
Change-Id: I2d5cc8f4c5b73d6a2b184cd21016943b8c08949d
diff --git a/tests/fixtures/layout-live-reconfiguration-failed-job.yaml b/tests/fixtures/layout-live-reconfiguration-failed-job.yaml
new file mode 100644
index 0000000..e811af1
--- /dev/null
+++ b/tests/fixtures/layout-live-reconfiguration-failed-job.yaml
@@ -0,0 +1,25 @@
+pipelines:
+ - name: check
+ manager: IndependentPipelineManager
+ trigger:
+ gerrit:
+ - event: patchset-created
+ success:
+ gerrit:
+ verified: 1
+ failure:
+ gerrit:
+ verified: -1
+
+jobs:
+ - name: ^.*-merge$
+ failure-message: Unable to merge change
+ hold-following-changes: true
+
+projects:
+ - name: org/project
+ merge-mode: cherry-pick
+ check:
+ - project-merge:
+ - project-test2
+ - project-testfile
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 8347f7a..baf8881 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -2325,7 +2325,7 @@
'SUCCESS')
self.assertEqual(len(self.history), 4)
- def test_live_reconfiguration_failed_job(self):
+ def test_live_reconfiguration_failed_root(self):
# An extrapolation of test_live_reconfiguration_merge_conflict
# that tests a job added to a job tree with a failed root does
# not run.
@@ -2387,6 +2387,59 @@
self.assertEqual(self.history[4].result, 'SUCCESS')
self.assertEqual(len(self.history), 5)
+ def test_live_reconfiguration_failed_job(self):
+ # Test that a change with a removed failing job does not
+ # disrupt reconfiguration. If a change has a failed job and
+ # that job is removed during a reconfiguration, we observed a
+ # bug where the code to re-set build statuses would run on
+ # that build and raise an exception because the job no longer
+ # existed.
+ self.worker.hold_jobs_in_build = True
+
+ A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+
+ # This change will fail and later be removed by the reconfiguration.
+ self.worker.addFailTest('project-test1', A)
+
+ self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
+ self.waitUntilSettled()
+ self.worker.release('.*-merge')
+ self.waitUntilSettled()
+ self.worker.release('project-test1')
+ self.waitUntilSettled()
+
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(A.reported, 0)
+
+ self.assertEqual(self.getJobFromHistory('project-merge').result,
+ 'SUCCESS')
+ self.assertEqual(self.getJobFromHistory('project-test1').result,
+ 'FAILURE')
+ self.assertEqual(len(self.history), 2)
+
+ # Remove the test1 job.
+ self.config.set('zuul', 'layout_config',
+ 'tests/fixtures/layout-live-'
+ 'reconfiguration-failed-job.yaml')
+ self.sched.reconfigure(self.config)
+ self.waitUntilSettled()
+
+ self.worker.hold_jobs_in_build = False
+ self.worker.release()
+ self.waitUntilSettled()
+
+ self.assertEqual(self.getJobFromHistory('project-test2').result,
+ 'SUCCESS')
+ self.assertEqual(self.getJobFromHistory('project-testfile').result,
+ 'SUCCESS')
+ self.assertEqual(len(self.history), 4)
+
+ self.assertEqual(A.data['status'], 'NEW')
+ self.assertEqual(A.reported, 1)
+ self.assertIn('Build succeeded', A.messages[0])
+ # Ensure the removed job was not included in the report.
+ self.assertNotIn('project-test1', A.messages[0])
+
def test_live_reconfiguration_functions(self):
"Test live reconfiguration with a custom function"
self.worker.registerFunction('build:node-project-test1:debian')