Add attempts logic for jobs

Today, if a job is aborted, zuul will relaunch said job until it
succeeds or fails.  If the job continues to abort, zuul will loop
forever.  To prevent this, add the ability to limit the number of
attempts.  By default we'll try to launch an aborted job a total of
3 times before RETRY_LIMIT is returned as the result.

Change-Id: Ie26fdc29c07430ebfb3df8be8ac1786d63d7e0fe
Signed-off-by: Paul Belanger <pabelanger@redhat.com>
diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst
index 2285ecb..e8279d9 100644
--- a/doc/source/zuul.rst
+++ b/doc/source/zuul.rst
@@ -803,6 +803,11 @@
   Boolean value (``true`` or ``false``) that indicates whatever
   a job is voting or not.  Default: ``true``.
 
+**attempts (optional)**
+  Number of times zuul will attempt to launch a job. Once this limit is
+  reached, zuul will report RETRY_LIMIT as the job result.
+  Default: ``3``.
+
 **tags (optional)**
   A list of arbitrary strings which will be associated with the job.
   Can be used by the parameter-function to alter behavior based on
diff --git a/tests/base.py b/tests/base.py
index c5b5b78..a14b4a9 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -540,6 +540,7 @@
         self.wait_condition = threading.Condition()
         self.waiting = False
         self.aborted = False
+        self.requeue = False
         self.created = time.time()
         self.description = ''
         self.run_error = False
@@ -602,6 +603,8 @@
             result = 'FAILURE'
         if self.aborted:
             result = 'ABORTED'
+        if self.requeue:
+            result = None
 
         if self.run_error:
             work_fail = True
diff --git a/tests/fixtures/layout-abort-attempts.yaml b/tests/fixtures/layout-abort-attempts.yaml
new file mode 100644
index 0000000..86d9d78
--- /dev/null
+++ b/tests/fixtures/layout-abort-attempts.yaml
@@ -0,0 +1,30 @@
+pipelines:
+  - name: check
+    manager: IndependentPipelineManager
+    trigger:
+      gerrit:
+        - event: patchset-created
+    success:
+      gerrit:
+        verified: 1
+    failure:
+      gerrit:
+        verified: -1
+
+  - name: post
+    manager: IndependentPipelineManager
+    trigger:
+      gerrit:
+        - event: ref-updated
+          ref: ^(?!refs/).*$
+
+jobs:
+  - name: project-test1
+    attempts: 4
+
+projects:
+  - name: org/project
+    check:
+      - project-merge:
+        - project-test1
+        - project-test2
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 335f987..b6fa4a3 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -4481,3 +4481,36 @@
         self.assertIn(
             '- docs-draft-test2 https://server/job/docs-draft-test2/1/',
             body[3])
+
+    def test_rerun_on_abort(self):
+        "Test that if a worker fails to run a job, it is run again"
+
+        self.config.set('zuul', 'layout_config',
+                        'tests/fixtures/layout-abort-attempts.yaml')
+        self.sched.reconfigure(self.config)
+        self.worker.hold_jobs_in_build = True
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
+        self.waitUntilSettled()
+
+        self.worker.release('.*-merge')
+        self.waitUntilSettled()
+
+        self.assertEqual(len(self.builds), 2)
+        self.builds[0].requeue = True
+        self.worker.release('.*-test*')
+        self.waitUntilSettled()
+
+        for x in range(3):
+            self.assertEqual(len(self.builds), 1)
+            self.builds[0].requeue = True
+            self.worker.release('.*-test1')
+            self.waitUntilSettled()
+
+        self.worker.hold_jobs_in_build = False
+        self.worker.release()
+        self.waitUntilSettled()
+        self.assertEqual(len(self.history), 6)
+        self.assertEqual(self.countJobResults(self.history, 'SUCCESS'), 2)
+        self.assertEqual(A.reported, 1)
+        self.assertIn('RETRY_LIMIT', A.messages[0])
diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py
index 02f78fd..2840ba6 100644
--- a/zuul/launcher/gearman.py
+++ b/zuul/launcher/gearman.py
@@ -367,6 +367,12 @@
             self.onBuildCompleted(gearman_job, 'NOT_REGISTERED')
             return build
 
+        # NOTE(pabelanger): Rather than looping forever, check to see if the
+        # job has passed its attempts limit.
+        if item.current_build_set.getTries(job.name) > job.attempts:
+            self.onBuildCompleted(gearman_job, 'RETRY_LIMIT')
+            return build
+
         if pipeline.precedence == zuul.model.PRECEDENCE_NORMAL:
             precedence = gear.PRECEDENCE_NORMAL
         elif pipeline.precedence == zuul.model.PRECEDENCE_HIGH:
diff --git a/zuul/layoutvalidator.py b/zuul/layoutvalidator.py
index e1e8ac6..91e15d1 100644
--- a/zuul/layoutvalidator.py
+++ b/zuul/layoutvalidator.py
@@ -103,6 +103,7 @@
            'success-pattern': str,
            'hold-following-changes': bool,
            'voting': bool,
+           'attempts': int,
            'mutex': str,
            'tags': toList(str),
            'parameter-function': str,
diff --git a/zuul/model.py b/zuul/model.py
index 46b0b98..b24a06b 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -466,6 +466,8 @@
         self._files = []
         self.skip_if_matcher = None
         self.swift = {}
+        # Number of attempts to launch a job before giving up.
+        self.attempts = 3
 
     def __str__(self):
         return self.name
@@ -646,6 +648,7 @@
         self.unable_to_merge = False
         self.failing_reasons = []
         self.merge_state = self.NEW
+        self.tries = {}
 
     def __repr__(self):
         return '<BuildSet item: %s #builds: %s merge state: %s>' % (
@@ -671,9 +674,12 @@
 
     def addBuild(self, build):
         self.builds[build.job.name] = build
+        if build.job.name not in self.tries:
+            self.tries[build.job.name] = 1
         build.build_set = self
 
     def removeBuild(self, build):
+        self.tries[build.job.name] += 1
         del self.builds[build.job.name]
 
     def getBuild(self, job_name):
@@ -684,6 +690,9 @@
         keys.sort()
         return [self.builds.get(x) for x in keys]
 
+    def getTries(self, job_name):
+        return self.tries.get(job_name)
+
 
 class QueueItem(object):
     """A changish inside of a Pipeline queue"""
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index b52931e..8c26541 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -529,6 +529,7 @@
             m = config_job.get('hold-following-changes', False)
             if m:
                 job.hold_following_changes = True
+            job.attempts = config_job.get('attempts', 3)
             m = config_job.get('voting', None)
             if m is not None:
                 job.voting = m