Merge "Fix bug with removing a failed job"
diff --git a/tests/fixtures/layout-live-reconfiguration-failed-job.yaml b/tests/fixtures/layout-live-reconfiguration-failed-job.yaml
new file mode 100644
index 0000000..e811af1
--- /dev/null
+++ b/tests/fixtures/layout-live-reconfiguration-failed-job.yaml
@@ -0,0 +1,25 @@
+pipelines:
+  - name: check
+    manager: IndependentPipelineManager
+    trigger:
+      gerrit:
+        - event: patchset-created
+    success:
+      gerrit:
+        verified: 1
+    failure:
+      gerrit:
+        verified: -1
+
+jobs:
+  - name: ^.*-merge$
+    failure-message: Unable to merge change
+    hold-following-changes: true
+
+projects:
+  - name: org/project
+    merge-mode: cherry-pick
+    check:
+      - project-merge:
+        - project-test2
+        - project-testfile
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 8347f7a..baf8881 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -2325,7 +2325,7 @@
                          'SUCCESS')
         self.assertEqual(len(self.history), 4)
 
-    def test_live_reconfiguration_failed_job(self):
+    def test_live_reconfiguration_failed_root(self):
         # An extrapolation of test_live_reconfiguration_merge_conflict
         # that tests a job added to a job tree with a failed root does
         # not run.
@@ -2387,6 +2387,59 @@
         self.assertEqual(self.history[4].result, 'SUCCESS')
         self.assertEqual(len(self.history), 5)
 
+    def test_live_reconfiguration_failed_job(self):
+        # Test that a change with a removed failing job does not
+        # disrupt reconfiguration.  If a change has a failed job and
+        # that job is removed during a reconfiguration, we observed a
+        # bug where the code to re-set build statuses would run on
+        # that build and raise an exception because the job no longer
+        # existed.
+        self.worker.hold_jobs_in_build = True
+
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+
+        # This change will fail and later be removed by the reconfiguration.
+        self.worker.addFailTest('project-test1', A)
+
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
+        self.waitUntilSettled()
+        self.worker.release('.*-merge')
+        self.waitUntilSettled()
+        self.worker.release('project-test1')
+        self.waitUntilSettled()
+
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(A.reported, 0)
+
+        self.assertEqual(self.getJobFromHistory('project-merge').result,
+                         'SUCCESS')
+        self.assertEqual(self.getJobFromHistory('project-test1').result,
+                         'FAILURE')
+        self.assertEqual(len(self.history), 2)
+
+        # Remove the test1 job.
+        self.config.set('zuul', 'layout_config',
+                        'tests/fixtures/layout-live-'
+                        'reconfiguration-failed-job.yaml')
+        self.sched.reconfigure(self.config)
+        self.waitUntilSettled()
+
+        self.worker.hold_jobs_in_build = False
+        self.worker.release()
+        self.waitUntilSettled()
+
+        self.assertEqual(self.getJobFromHistory('project-test2').result,
+                         'SUCCESS')
+        self.assertEqual(self.getJobFromHistory('project-testfile').result,
+                         'SUCCESS')
+        self.assertEqual(len(self.history), 4)
+
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(A.reported, 1)
+        self.assertIn('Build succeeded', A.messages[0])
+        # Ensure the removed job was not included in the report.
+        self.assertNotIn('project-test1', A.messages[0])
+
     def test_live_reconfiguration_functions(self):
         "Test live reconfiguration with a custom function"
         self.worker.registerFunction('build:node-project-test1:debian')
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 4ee430a..41a3a16 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -679,7 +679,7 @@
                     continue
                 self.log.debug("Re-enqueueing changes for pipeline %s" % name)
                 items_to_remove = []
-                builds_to_remove = []
+                builds_to_cancel = []
                 last_head = None
                 for shared_queue in old_pipeline.queues:
                     for item in shared_queue.queue:
@@ -703,14 +703,15 @@
                             if job:
                                 build.job = job
                             else:
-                                builds_to_remove.append(build)
+                                item.removeBuild(build)
+                                builds_to_cancel.append(build)
                         if not new_pipeline.manager.reEnqueueItem(item,
                                                                   last_head):
                             items_to_remove.append(item)
                 for item in items_to_remove:
                     for build in item.current_build_set.getBuilds():
-                        builds_to_remove.append(build)
-                for build in builds_to_remove:
+                        builds_to_cancel.append(build)
+                for build in builds_to_cancel:
                     self.log.warning(
                         "Canceling build %s during reconfiguration" % (build,))
                     try: