Add post-timeout setting

This adds a post-timeout setting which applies as a timeout to each
post-run playbook. This is separate and independent of the normal job
timeout which now only applies to the pre-run and run playbooks in a
cumulative fashion.

The reason for this change is when a pre-run or run playbook hits the
timeout and the job fails you still want to do your best to copy all of
the log data that you can find so that you can debug the timeout.
Similarly to timeout, if post-timeout is not set then post-run playbooks
will have no timeout and can run indefinitely.

Change-Id: I830a6a14d2623f50fbc3f05396cc909d79de04bb
diff --git a/doc/source/user/config.rst b/doc/source/user/config.rst
index 8492423..7d2dae1 100644
--- a/doc/source/user/config.rst
+++ b/doc/source/user/config.rst
@@ -710,6 +710,21 @@
       timeout is supplied, the job may run indefinitely.  Supplying a
       timeout is highly recommended.
 
+      This timeout only applies to the pre-run and run playbooks in a
+      job.
+
+   .. attr:: post-timeout
+
+      The time in seconds that each post playbook should be allowed to run
+      before it is automatically aborted and failure is reported.  If no
+      post-timeout is supplied, the job may run indefinitely.  Supplying a
+      post-timeout is highly recommended.
+
+      The post-timeout is handled separately from the above timeout because
+      the post playbooks are typically where you will copy jobs logs.
+      In the event of the pre-run or run playbooks timing out we want to
+      do our best to copy the job logs in the post-run playbooks.
+
    .. attr:: attempts
       :default: 3
 
diff --git a/doc/source/user/jobs.rst b/doc/source/user/jobs.rst
index 820e316..4e1c33d 100644
--- a/doc/source/user/jobs.rst
+++ b/doc/source/user/jobs.rst
@@ -289,6 +289,10 @@
 
       The job timeout, in seconds.
 
+   .. var:: post_timeout
+
+      The post-run playbook timeout, in seconds.
+
    .. var:: jobtags
 
       A list of tags associated with the job.  Not to be confused with
diff --git a/tests/fixtures/config/ansible/git/common-config/zuul.yaml b/tests/fixtures/config/ansible/git/common-config/zuul.yaml
index d0a8f7b..0112141 100644
--- a/tests/fixtures/config/ansible/git/common-config/zuul.yaml
+++ b/tests/fixtures/config/ansible/git/common-config/zuul.yaml
@@ -99,6 +99,12 @@
 
 - job:
     parent: python27
+    name: post-timeout
+    post-run: playbooks/timeout.yaml
+    post-timeout: 1
+
+- job:
+    parent: python27
     name: check-vars
     run: playbooks/check-vars.yaml
     nodeset:
diff --git a/tests/fixtures/config/ansible/git/org_project/.zuul.yaml b/tests/fixtures/config/ansible/git/org_project/.zuul.yaml
index 447f6cd..a1e144f 100644
--- a/tests/fixtures/config/ansible/git/org_project/.zuul.yaml
+++ b/tests/fixtures/config/ansible/git/org_project/.zuul.yaml
@@ -17,5 +17,6 @@
         - check-vars
         - check-secret-names
         - timeout
+        - post-timeout
         - hello-world
         - failpost
diff --git a/tests/unit/test_v3.py b/tests/unit/test_v3.py
index 1338d20..e36c8f6 100755
--- a/tests/unit/test_v3.py
+++ b/tests/unit/test_v3.py
@@ -2048,6 +2048,12 @@
         build_timeout = self.getJobFromHistory('timeout')
         with self.jobLog(build_timeout):
             self.assertEqual(build_timeout.result, 'TIMED_OUT')
+            post_flag_path = os.path.join(self.test_root, build_timeout.uuid +
+                                          '.post.flag')
+            self.assertTrue(os.path.exists(post_flag_path))
+        build_post_timeout = self.getJobFromHistory('post-timeout')
+        with self.jobLog(build_post_timeout):
+            self.assertEqual(build_post_timeout.result, 'POST_FAILURE')
         build_faillocal = self.getJobFromHistory('faillocal')
         with self.jobLog(build_faillocal):
             self.assertEqual(build_faillocal.result, 'FAILURE')
diff --git a/zuul/configloader.py b/zuul/configloader.py
index f0f78b7..d251106 100644
--- a/zuul/configloader.py
+++ b/zuul/configloader.py
@@ -491,6 +491,7 @@
                       # validation happens in NodeSetParser
                       'nodeset': vs.Any(dict, str),
                       'timeout': int,
+                      'post-timeout': int,
                       'attempts': int,
                       'pre-run': to_list(str),
                       'post-run': to_list(str),
@@ -518,6 +519,7 @@
         'abstract',
         'protected',
         'timeout',
+        'post-timeout',
         'workspace',
         'voting',
         'hold-following-changes',
@@ -627,6 +629,10 @@
            int(conf['timeout']) > tenant.max_job_timeout:
             raise MaxTimeoutError(job, tenant)
 
+        if conf.get('post-timeout') and tenant.max_job_timeout != -1 and \
+           int(conf['post-timeout']) > tenant.max_job_timeout:
+            raise MaxTimeoutError(job, tenant)
+
         if 'post-review' in conf:
             if conf['post-review']:
                 job.post_review = True
diff --git a/zuul/executor/client.py b/zuul/executor/client.py
index d561232..c09d4e1 100644
--- a/zuul/executor/client.py
+++ b/zuul/executor/client.py
@@ -186,6 +186,7 @@
         params = dict()
         params['job'] = job.name
         params['timeout'] = job.timeout
+        params['post_timeout'] = job.post_timeout
         params['items'] = merger_items
         params['projects'] = []
         if hasattr(item.change, 'branch'):
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index 8de6fe0..fbc34c4 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -878,8 +878,10 @@
         success = False
         self.started = True
         time_started = time.time()
-        # timeout value is total job timeout or put another way
-        # the cummulative time that pre, run, and post can consume.
+        # timeout value is "total" job timeout which accounts for
+        # pre-run and run playbooks. post-run is different because
+        # it is used to copy out job logs and we want to do our best
+        # to copy logs even when the job has timed out.
         job_timeout = args['timeout']
         for index, playbook in enumerate(self.jobdir.pre_playbooks):
             # TODOv3(pabelanger): Implement pre-run timeout setting.
@@ -914,11 +916,15 @@
                 # run it again.
                 return None
 
+        post_timeout = args['post_timeout']
         for index, playbook in enumerate(self.jobdir.post_playbooks):
-            # TODOv3(pabelanger): Implement post-run timeout setting.
-            ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
+            # Post timeout operates a little differently to the main job
+            # timeout. We give each post playbook the full post timeout to
+            # do its job because post is where you'll often record job logs
+            # which are vital to understanding why timeouts have happened in
+            # the first place.
             post_status, post_code = self.runAnsiblePlaybook(
-                playbook, ansible_timeout, success, phase='post', index=index)
+                playbook, post_timeout, success, phase='post', index=index)
             if post_status == self.RESULT_ABORTED:
                 return 'ABORTED'
             if post_status != self.RESULT_NORMAL or post_code != 0:
diff --git a/zuul/model.py b/zuul/model.py
index 45fc1a8..ff84048 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -839,6 +839,7 @@
         self.execution_attributes = dict(
             parent=None,
             timeout=None,
+            post_timeout=None,
             variables={},
             nodeset=NodeSet(),
             workspace=None,