On reconfiguration, re-enqueue items at the same position
Upon reconfiguration, we currently re-enqueue every item back into
its pipeline in case a difference in the configuration changes what
should happen to it.  In the simple case this is fine, but if a
dependent pipeline has a branching dependency structure due to
already-failing jobs, we would erroneously re-order the changes back
into a linear arrangement.  The next pass through the pipeline
manager would move those items back to where they should be, but in
doing so would reset their builds.
To correct this, after re-enqueueing a change, if the change that was
previously ahead of it in the pipeline was also successfully
re-enqueued, immediately move the change back behind it (or, if the
change ahead was None, meaning the change was its own head, move it
behind no change).  This should put changes back where they were in
relation to other failing changes.  If the change ahead was not
successfully re-enqueued, the current behavior of simply placing the
change behind the nearest change in the queue is preserved.  If
anything more complex happens, any errors will be corrected on the
next pass through the pipeline manager.
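
In pseudocode, the intent is roughly the following (a minimal
sketch; the queue object and its enqueue/move helpers stand in for
the real ChangeQueue and are invented for illustration, not the
actual implementation):

    # Hypothetical illustration of the placement rule described
    # above; only old_item_ahead/item_ahead_valid come from the
    # real patch.
    def re_enqueue(queue, item, old_item_ahead, item_ahead_valid):
        # Default placement: behind the nearest change in the queue.
        queue.enqueue(item)
        if item_ahead_valid:
            # The previous item ahead was also re-enqueued (or the
            # item was its own head and old_item_ahead is None), so
            # restore the old ordering now instead of letting the
            # next pipeline-manager pass reset the item's builds.
            queue.move(item, old_item_ahead)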
Change-Id: Ie3771d9bbbc1ca77425cf62751d8e5f70ba1f14c
diff --git a/tests/fixtures/layouts/reconfigure-failed-head.yaml b/tests/fixtures/layouts/reconfigure-failed-head.yaml
new file mode 100644
index 0000000..3347c9b
--- /dev/null
+++ b/tests/fixtures/layouts/reconfigure-failed-head.yaml
@@ -0,0 +1,56 @@
+- pipeline:
+    name: check
+    manager: independent
+    trigger:
+      gerrit:
+        - event: patchset-created
+    success:
+      gerrit:
+        Verified: 1
+    failure:
+      gerrit:
+        Verified: -1
+
+- pipeline:
+    name: gate
+    manager: dependent
+    success-message: Build succeeded (gate).
+    trigger:
+      gerrit:
+        - event: comment-added
+          approval:
+            - Approved: 1
+    success:
+      gerrit:
+        Verified: 2
+        submit: true
+    failure:
+      gerrit:
+        Verified: -2
+    start:
+      gerrit:
+        Verified: 0
+    precedence: high
+
+- job:
+    name: base
+    parent: null
+
+- job:
+    name: job1
+    run: playbooks/job1.yaml
+
+- job:
+    name: job2
+    run: playbooks/job2.yaml
+
+- project:
+    name: org/project
+    check:
+      jobs:
+        - job1
+        - job2
+    gate:
+      jobs:
+        - job1
+        - job2
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index b0df2b2..53a20ff 100755
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -3034,6 +3034,70 @@
         self.assertEqual(len(tenant.layout.pipelines['check'].queues), 0)
         self.assertIn('Build succeeded', A.messages[0])
 
+    @simple_layout("layouts/reconfigure-failed-head.yaml")
+    def test_live_reconfiguration_failed_change_at_head(self):
+        # Test that if we reconfigure with a failed change at head,
+        # the change behind it isn't reparented onto it.
+
+        self.executor_server.hold_jobs_in_build = True
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        A.addApproval('Code-Review', 2)
+        B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
+        B.addApproval('Code-Review', 2)
+
+        self.executor_server.failJob('job1', A)
+
+        self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
+        self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
+
+        self.waitUntilSettled()
+
+        self.assertBuilds([
+            dict(name='job1', changes='1,1'),
+            dict(name='job2', changes='1,1'),
+            dict(name='job1', changes='1,1 2,1'),
+            dict(name='job2', changes='1,1 2,1'),
+        ])
+
+        self.release(self.builds[0])
+        self.waitUntilSettled()
+
+        self.assertBuilds([
+            dict(name='job2', changes='1,1'),
+            dict(name='job1', changes='2,1'),
+            dict(name='job2', changes='2,1'),
+        ])
+
+        # Unordered history comparison because the aborts can finish
+        # in any order.
+        self.assertHistory([
+            dict(name='job1', result='FAILURE', changes='1,1'),
+            dict(name='job1', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='ABORTED', changes='1,1 2,1'),
+        ], ordered=False)
+
+        self.sched.reconfigure(self.config)
+        self.waitUntilSettled()
+
+        self.executor_server.hold_jobs_in_build = False
+        self.executor_server.release()
+        self.waitUntilSettled()
+
+        self.assertBuilds([])
+
+        self.assertHistory([
+            dict(name='job1', result='FAILURE', changes='1,1'),
+            dict(name='job1', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='ABORTED', changes='1,1 2,1'),
+            dict(name='job2', result='SUCCESS', changes='1,1'),
+            dict(name='job1', result='SUCCESS', changes='2,1'),
+            dict(name='job2', result='SUCCESS', changes='2,1'),
+        ], ordered=False)
+        self.assertEqual(A.data['status'], 'NEW')
+        self.assertEqual(B.data['status'], 'MERGED')
+        self.assertEqual(A.reported, 2)
+        self.assertEqual(B.reported, 2)
+
     def test_delayed_repo_init(self):
         self.init_repo("org/new-project")
         files = {'README': ''}
diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py
index 9c30d74..6c72c2d 100644
--- a/zuul/manager/__init__.py
+++ b/zuul/manager/__init__.py
@@ -223,13 +223,19 @@
             if item.change.equals(change):
                 self.removeItem(item)
 
-    def reEnqueueItem(self, item, last_head):
+    def reEnqueueItem(self, item, last_head, old_item_ahead, item_ahead_valid):
         with self.getChangeQueue(item.change, last_head.queue) as change_queue:
             if change_queue:
                 self.log.debug("Re-enqueing change %s in queue %s" %
                                (item.change, change_queue))
                 change_queue.enqueueItem(item)
+                # If the old item ahead was re-enqueued, this value
+                # will be true, so we should attempt to move the item
+                # back to where it was in case an item ahead is
+                # already failing.
+                if item_ahead_valid:
+                    change_queue.moveItem(item, old_item_ahead)
 
                 # Get an updated copy of the layout and update the job
                 # graph if necessary. This resumes the buildset merge
                 # state machine. If we have an up-to-date layout, it
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 73fa923..a725fcd 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -614,13 +614,22 @@
                     item.queue = None
                     item.change.project = self._reenqueueGetProject(
                         tenant, item)
+                    # If the old item ahead made it in, remember it so
+                    # we can re-enqueue this one behind it.
+                    if item.item_ahead in items_to_remove:
+                        old_item_ahead = None
+                        item_ahead_valid = False
+                    else:
+                        old_item_ahead = item.item_ahead
+                        item_ahead_valid = True
                     item.item_ahead = None
                     item.items_behind = []
                     reenqueued = False
                     if item.change.project:
                         try:
                             reenqueued = new_pipeline.manager.reEnqueueItem(
-                                item, last_head)
+                                item, last_head, old_item_ahead,
+                                item_ahead_valid=item_ahead_valid)
                         except Exception:
                             self.log.exception(
                                 "Exception while re-enqueing item %s",