Merge "Cleanly shutdown zuul scheduler if startup fails"

commit: 985bb4467931bf44167c7bc70f4de93b26641943 [log] [tgz]
author: Zuul <zuul@review.openstack.org> Tue Feb 13 21:00:55 2018 +0000
committer: Gerrit Code Review <review@openstack.org> Tue Feb 13 21:00:55 2018 +0000
tree: 60edb44a2665faaaaa5b6c7fd4bb584c4044940e
parent: 9bae3c0b18245f6d732cdae05b208cb05320e333 [diff]
parent: 12c51791c21abe79906fca7953779b2fa50315af [diff]
diff --git a/doc/source/admin/components.rst b/doc/source/admin/components.rst
index 88b898f..ba14752 100644
--- a/doc/source/admin/components.rst
+++ b/doc/source/admin/components.rst

@@ -442,6 +442,11 @@
 
       Port to use for finger log streamer.
 
+   .. attr:: state_dir
+      :default: /var/lib/zuul
+
+      Path to directory in which Zuul should save its state.
+
    .. attr:: git_dir
       :default: /var/lib/zuul/git
 

diff --git a/doc/source/admin/tenants.rst b/doc/source/admin/tenants.rst
index 48e7ba8..5bcd2a2 100644
--- a/doc/source/admin/tenants.rst
+++ b/doc/source/admin/tenants.rst

@@ -25,7 +25,7 @@
 ------
 
 A tenant is a collection of projects which share a Zuul
-configuration.  An example tenant definition is:
+configuration. Some examples of tenant definitions are:
 
 .. code-block:: yaml
 
@@ -46,6 +46,27 @@
              - project2:
                  exclude-unprotected-branches: true
 
+.. code-block:: yaml
+
+   - tenant:
+       name: my-tenant
+       source:
+         gerrit:
+           config-projects:
+             - common-config
+           untrusted-projects:
+             - exclude:
+                 - job
+                 - semaphore
+                 - project
+                 - project-template
+                 - nodeset
+                 - secret
+               projects:
+                 - project1
+                 - project2:
+                     exclude-unprotected-branches: true
+
 .. attr:: tenant
 
    The following attributes are supported:
@@ -157,6 +178,24 @@
             processed. Defaults to the tenant wide setting of
             exclude-unprotected-branches.
 
+      .. attr:: <project-group>
+
+         The items in the list are dictionaries with the following
+         attributes. The **configuration items** definition of each
+         dictionary is applied to every project in its list of projects.
+
+         .. attr:: include
+
+            A list of **configuration items** that should be loaded.
+
+         .. attr:: exclude
+
+            A list of **configuration items** that should not be loaded.
+
+         .. attr:: projects
+
+            A list of **project** items.
+
    .. attr:: max-nodes-per-job
       :default: 5
 

diff --git a/requirements.txt b/requirements.txt
index f24f195..7057c5a 100644
--- a/requirements.txt
+++ b/requirements.txt

@@ -25,6 +25,6 @@
 cachecontrol
 pyjwt
 iso8601
-aiohttp
+aiohttp<3.0.0
 uvloop;python_version>='3.5'
 psutil

diff --git a/tests/base.py b/tests/base.py
index f68f59a..70889bb 100755
--- a/tests/base.py
+++ b/tests/base.py

@@ -1640,6 +1640,10 @@
         nodeid = path.split("/")[-1]
         return nodeid
 
+    def removeNode(self, node):
+        path = self.NODE_ROOT + '/' + node["_oid"]
+        self.client.delete(path, recursive=True)
+
     def addFailRequest(self, request):
         self.fail_requests.add(request['_oid'])
 

diff --git a/tests/fixtures/layouts/timer-github.yaml b/tests/fixtures/layouts/timer-github.yaml
new file mode 100644
index 0000000..4f3efe4
--- /dev/null
+++ b/tests/fixtures/layouts/timer-github.yaml

@@ -0,0 +1,25 @@
+- pipeline:
+    name: periodic
+    manager: independent
+    trigger:
+      timer:
+        - time: '* * * * * */1'
+
+- job:
+    name: base
+    parent: null
+    run: playbooks/base.yaml
+
+- job:
+    name: project-bitrot
+    nodeset:
+      nodes:
+        - name: static
+          label: ubuntu-xenial
+    run: playbooks/project-bitrot.yaml
+
+- project:
+    name: org/project
+    periodic:
+      jobs:
+        - project-bitrot

diff --git a/tests/unit/test_github_driver.py b/tests/unit/test_github_driver.py
index cd36ba3..8978415 100644
--- a/tests/unit/test_github_driver.py
+++ b/tests/unit/test_github_driver.py

@@ -210,6 +210,34 @@
         self.waitUntilSettled()
         self.assertEqual(1, len(self.history))
 
+    @simple_layout('layouts/basic-github.yaml', driver='github')
+    def test_timer_event(self):
+        self.executor_server.hold_jobs_in_build = True
+        self.commitConfigUpdate('org/common-config',
+                                'layouts/timer-github.yaml')
+        self.sched.reconfigure(self.config)
+        time.sleep(2)
+        self.waitUntilSettled()
+        self.assertEqual(len(self.builds), 1)
+        self.executor_server.hold_jobs_in_build = False
+        # Stop queuing timer triggered jobs so that the assertions
+        # below don't race against more jobs being queued.
+        self.commitConfigUpdate('org/common-config',
+                                'layouts/basic-github.yaml')
+        self.sched.reconfigure(self.config)
+        self.waitUntilSettled()
+        # If APScheduler is in mid-event when we remove the job, we
+        # can end up with one more event firing, so give it an extra
+        # second to settle.
+        time.sleep(1)
+        self.waitUntilSettled()
+        self.executor_server.release()
+        self.waitUntilSettled()
+        self.assertHistory([
+            dict(name='project-bitrot', result='SUCCESS',
+                 ref='refs/heads/master'),
+        ], ordered=False)
+
     @simple_layout('layouts/dequeue-github.yaml', driver='github')
     def test_dequeue_pull_synchronized(self):
         self.executor_server.hold_jobs_in_build = True

diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 9b54084..c833fa2 100755
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py

@@ -1506,7 +1506,7 @@
                                           self.gearman_server.port)
         self.addCleanup(client.shutdown)
         r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
         self.assertTrue(r)
 
         # First check that successful jobs do not autohold
@@ -1553,7 +1553,7 @@
             held_node['hold_job'],
             " ".join(['tenant-one',
                       'review.example.com/org/project',
-                      'project-test2'])
+                      'project-test2', '.*'])
         )
         self.assertEqual(held_node['comment'], "reason text")
 
@@ -1573,13 +1573,151 @@
                 held_nodes += 1
         self.assertEqual(held_nodes, 1)
 
+    def _test_autohold_scoped(self, change_obj, change, ref):
+        client = zuul.rpcclient.RPCClient('127.0.0.1',
+                                          self.gearman_server.port)
+        self.addCleanup(client.shutdown)
+
+        # create two changes on the same project, and autohold request
+        # for one of them.
+        other = self.fake_gerrit.addFakeChange(
+            'org/project', 'master', 'other'
+        )
+
+        r = client.autohold('tenant-one', 'org/project', 'project-test2',
+                            str(change), ref, "reason text", 1)
+        self.assertTrue(r)
+
+        # First, check that an unrelated job does not trigger autohold, even
+        # when it failed
+        self.executor_server.failJob('project-test2', other)
+        self.fake_gerrit.addEvent(other.getPatchsetCreatedEvent(1))
+
+        self.waitUntilSettled()
+
+        self.assertEqual(other.data['status'], 'NEW')
+        self.assertEqual(other.reported, 1)
+        # project-test2
+        self.assertEqual(self.history[0].result, 'FAILURE')
+
+        # Check nodepool for a held node
+        held_node = None
+        for node in self.fake_nodepool.getNodes():
+            if node['state'] == zuul.model.STATE_HOLD:
+                held_node = node
+                break
+        self.assertIsNone(held_node)
+
+        # And then verify that failed job for the defined change
+        # triggers the autohold
+
+        self.executor_server.failJob('project-test2', change_obj)
+        self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
+
+        self.waitUntilSettled()
+
+        self.assertEqual(change_obj.data['status'], 'NEW')
+        self.assertEqual(change_obj.reported, 1)
+        # project-test2
+        self.assertEqual(self.history[1].result, 'FAILURE')
+
+        # Check nodepool for a held node
+        held_node = None
+        for node in self.fake_nodepool.getNodes():
+            if node['state'] == zuul.model.STATE_HOLD:
+                held_node = node
+                break
+        self.assertIsNotNone(held_node)
+
+        # Validate node has recorded the failed job
+        if change != "":
+            ref = "refs/changes/%s/%s/.*" % (
+                str(change_obj.number)[-1:], str(change_obj.number)
+            )
+
+        self.assertEqual(
+            held_node['hold_job'],
+            " ".join(['tenant-one',
+                      'review.example.com/org/project',
+                      'project-test2', ref])
+        )
+        self.assertEqual(held_node['comment'], "reason text")
+
+    @simple_layout('layouts/autohold.yaml')
+    def test_autohold_change(self):
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+
+        self._test_autohold_scoped(A, change=A.number, ref="")
+
+    @simple_layout('layouts/autohold.yaml')
+    def test_autohold_ref(self):
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+        ref = A.data['currentPatchSet']['ref']
+        self._test_autohold_scoped(A, change="", ref=ref)
+
+    @simple_layout('layouts/autohold.yaml')
+    def test_autohold_scoping(self):
+        client = zuul.rpcclient.RPCClient('127.0.0.1',
+                                          self.gearman_server.port)
+        self.addCleanup(client.shutdown)
+
+        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
+
+        # create three autohold requests, scoped to job, change and
+        # a specific ref
+        change = str(A.number)
+        ref = A.data['currentPatchSet']['ref']
+        r1 = client.autohold('tenant-one', 'org/project', 'project-test2',
+                             "", "", "reason text", 1)
+        self.assertTrue(r1)
+        r2 = client.autohold('tenant-one', 'org/project', 'project-test2',
+                             change, "", "reason text", 1)
+        self.assertTrue(r2)
+        r3 = client.autohold('tenant-one', 'org/project', 'project-test2',
+                             "", ref, "reason text", 1)
+        self.assertTrue(r3)
+
+        # Fail 3 jobs for the same change, and verify that the autohold
+        # requests are fulfilled in the expected order: from the most
+        # specific towards the most generic one.
+
+        def _fail_job_and_verify_autohold_request(change_obj, ref_filter):
+            self.executor_server.failJob('project-test2', change_obj)
+            self.fake_gerrit.addEvent(change_obj.getPatchsetCreatedEvent(1))
+
+            self.waitUntilSettled()
+
+            # Check nodepool for a held node
+            held_node = None
+            for node in self.fake_nodepool.getNodes():
+                if node['state'] == zuul.model.STATE_HOLD:
+                    held_node = node
+                    break
+            self.assertIsNotNone(held_node)
+
+            self.assertEqual(
+                held_node['hold_job'],
+                " ".join(['tenant-one',
+                          'review.example.com/org/project',
+                          'project-test2', ref_filter])
+            )
+            self.assertFalse(held_node['_lock'], "Node %s is locked" %
+                             (node['_oid'],))
+            self.fake_nodepool.removeNode(held_node)
+
+        _fail_job_and_verify_autohold_request(A, ref)
+
+        ref = "refs/changes/%s/%s/.*" % (str(change)[-1:], str(change))
+        _fail_job_and_verify_autohold_request(A, ref)
+        _fail_job_and_verify_autohold_request(A, ".*")
+
     @simple_layout('layouts/autohold.yaml')
     def test_autohold_ignores_aborted_jobs(self):
         client = zuul.rpcclient.RPCClient('127.0.0.1',
                                           self.gearman_server.port)
         self.addCleanup(client.shutdown)
         r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
         self.assertTrue(r)
 
         self.executor_server.hold_jobs_in_build = True
@@ -1623,7 +1761,7 @@
         self.addCleanup(client.shutdown)
 
         r = client.autohold('tenant-one', 'org/project', 'project-test2',
-                            "reason text", 1)
+                            "", "", "reason text", 1)
         self.assertTrue(r)
 
         autohold_requests = client.autohold_list()
@@ -1632,11 +1770,12 @@
 
         # The single dict key should be a CSV string value
         key = list(autohold_requests.keys())[0]
-        tenant, project, job = key.split(',')
+        tenant, project, job, ref_filter = key.split(',')
 
         self.assertEqual('tenant-one', tenant)
         self.assertIn('org/project', project)
         self.assertEqual('project-test2', job)
+        self.assertEqual(".*", ref_filter)
 
         # Note: the value is converted from set to list by json.
         self.assertEqual([1, "reason text"], autohold_requests[key])

diff --git a/tools/encrypt_secret.py b/tools/encrypt_secret.py
index 4cb1666..45ad68c 100755
--- a/tools/encrypt_secret.py
+++ b/tools/encrypt_secret.py

@@ -26,9 +26,11 @@
 try:
     from urllib.request import Request
     from urllib.request import urlopen
+    from urllib.parse import urlparse
 except ImportError:
     from urllib2 import Request
     from urllib2 import urlopen
+    from urlparse import urlparse
 
 DESCRIPTION = """Encrypt a secret for Zuul.
 
@@ -43,7 +45,6 @@
     parser.add_argument('url',
                         help="The base URL of the zuul server and tenant.  "
                         "E.g., https://zuul.example.com/tenant-name")
-    # TODO(jeblair): Throw a fit if SSL is not used.
     parser.add_argument('project',
                         help="The name of the project.")
     parser.add_argument('--strip', action='store_true', default=False,
@@ -60,6 +61,15 @@
                         "to standard output.")
     args = parser.parse_args()
 
+    # We should not use unencrypted connections for retrieving the public key.
+    # Otherwise our secret can be compromised. The 'file' and 'https' schemes
+    # are considered safe.
+    url = urlparse(args.url)
+    if url.scheme not in ('file', 'https'):
+        sys.stderr.write("WARNING: Retrieving encryption key via an "
+                         "unencrypted connection. Your secret may get "
+                         "compromised.\n")
+
     req = Request("%s/%s.pub" % (args.url.rstrip('/'), args.project))
     pubkey = urlopen(req)
 

diff --git a/zuul/ansible/callback/zuul_stream.py b/zuul/ansible/callback/zuul_stream.py
index df28a57..15b491c 100644
--- a/zuul/ansible/callback/zuul_stream.py
+++ b/zuul/ansible/callback/zuul_stream.py

@@ -367,12 +367,13 @@
                 self._log_message(
                     result, status='MODULE FAILURE',
                     msg=result_dict['module_stderr'])
-        elif (len([key for key in result_dict.keys()
-                   if not key.startswith('_ansible')]) == 1):
+        elif result._task.action == 'debug':
             # this is a debug statement, handle it special
             for key in [k for k in result_dict.keys()
                         if k.startswith('_ansible')]:
                 del result_dict[key]
+            if 'changed' in result_dict.keys():
+                del result_dict['changed']
             keyname = next(iter(result_dict.keys()))
             # If it has msg, that means it was like:
             #

diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py
index ebf59b9..a7b3ef3 100755
--- a/zuul/cmd/client.py
+++ b/zuul/cmd/client.py

@@ -51,6 +51,11 @@
                                   required=True)
         cmd_autohold.add_argument('--job', help='job name',
                                   required=True)
+        cmd_autohold.add_argument('--change',
+                                  help='specific change to hold nodes for',
+                                  required=False, default='')
+        cmd_autohold.add_argument('--ref', help='git ref to hold nodes for',
+                                  required=False, default='')
         cmd_autohold.add_argument('--reason', help='reason for the hold',
                                   required=True)
         cmd_autohold.add_argument('--count',
@@ -173,9 +178,15 @@
     def autohold(self):
         client = zuul.rpcclient.RPCClient(
             self.server, self.port, self.ssl_key, self.ssl_cert, self.ssl_ca)
+        if self.args.change and self.args.ref:
+            print("Change and ref can't be both used for the same request")
+            return False
+
         r = client.autohold(tenant=self.args.tenant,
                             project=self.args.project,
                             job=self.args.job,
+                            change=self.args.change,
+                            ref=self.args.ref,
                             reason=self.args.reason,
                             count=self.args.count)
         return r
@@ -190,14 +201,19 @@
             return True
 
         table = prettytable.PrettyTable(
-            field_names=['Tenant', 'Project', 'Job', 'Count', 'Reason'])
+            field_names=[
+                'Tenant', 'Project', 'Job', 'Ref Filter', 'Count', 'Reason'
+            ])
 
         for key, value in autohold_requests.items():
             # The key comes to us as a CSV string because json doesn't like
             # non-str keys.
-            tenant_name, project_name, job_name = key.split(',')
+            tenant_name, project_name, job_name, ref_filter = key.split(',')
             count, reason = value
-            table.add_row([tenant_name, project_name, job_name, count, reason])
+
+            table.add_row([
+                tenant_name, project_name, job_name, ref_filter, count, reason
+            ])
         print(table)
         return True
 

diff --git a/zuul/driver/gerrit/gerritsource.py b/zuul/driver/gerrit/gerritsource.py
index fdc1ad7..8f3408e 100644
--- a/zuul/driver/gerrit/gerritsource.py
+++ b/zuul/driver/gerrit/gerritsource.py

@@ -141,6 +141,10 @@
         )
         return [f]
 
+    def getRefForChange(self, change):
+        partial = change[-2:]
+        return "refs/changes/%s/%s/.*" % (partial, change)
+
 
 approval = vs.Schema({'username': str,
                       'email': str,

diff --git a/zuul/driver/git/gitsource.py b/zuul/driver/git/gitsource.py
index a7d42be..9f0963d 100644
--- a/zuul/driver/git/gitsource.py
+++ b/zuul/driver/git/gitsource.py

@@ -68,3 +68,6 @@
 
     def getRejectFilters(self, config):
         return []
+
+    def getRefForChange(self, change):
+        raise NotImplemented()

diff --git a/zuul/driver/github/githubconnection.py b/zuul/driver/github/githubconnection.py
index 27d31b4..6dfcdd3 100644
--- a/zuul/driver/github/githubconnection.py
+++ b/zuul/driver/github/githubconnection.py

@@ -722,7 +722,8 @@
             change.newrev = event.newrev
             change.url = self.getGitwebUrl(project, sha=event.newrev)
             change.source_event = event
-            change.files = self.getPushedFileNames(event)
+            if hasattr(event, 'commits'):
+                change.files = self.getPushedFileNames(event)
         return change
 
     def _getChange(self, project, number, patchset=None, refresh=False):

diff --git a/zuul/driver/github/githubsource.py b/zuul/driver/github/githubsource.py
index 33f8f7c..6f9b14d 100644
--- a/zuul/driver/github/githubsource.py
+++ b/zuul/driver/github/githubsource.py

@@ -144,6 +144,9 @@
         )
         return [f]
 
+    def getRefForChange(self, change):
+        return "refs/pull/%s/head" % change
+
 
 review = v.Schema({'username': str,
                    'email': str,

diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index a831a53..53ef173 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py

@@ -777,7 +777,16 @@
         return data
 
     def doMergeChanges(self, merger, items, repo_state):
-        ret = merger.mergeChanges(items, repo_state=repo_state)
+        try:
+            ret = merger.mergeChanges(items, repo_state=repo_state)
+        except ValueError as e:
+            # Return ABORTED so that we'll try again. At this point all of
+            # the refs we're trying to merge should be valid refs. If we
+            # can't fetch them, it should resolve itself.
+            self.log.exception("Could not fetch refs to merge from remote")
+            result = dict(result='ABORTED')
+            self.job.sendWorkComplete(json.dumps(result))
+            return False
         if not ret:  # merge conflict
             result = dict(result='MERGER_FAILURE')
             if self.executor_server.statsd:
@@ -844,6 +853,13 @@
         repo.checkout(selected_ref)
         return selected_ref
 
+    def getAnsibleTimeout(self, start, timeout):
+        if timeout is not None:
+            now = time.time()
+            elapsed = now - start
+            timeout = timeout - elapsed
+        return timeout
+
     def runPlaybooks(self, args):
         result = None
 
@@ -861,10 +877,15 @@
         pre_failed = False
         success = False
         self.started = True
+        time_started = time.time()
+        # timeout value is the total job timeout or, put another way,
+        # the cumulative time that pre, run, and post can consume.
+        job_timeout = args['timeout']
         for index, playbook in enumerate(self.jobdir.pre_playbooks):
             # TODOv3(pabelanger): Implement pre-run timeout setting.
+            ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
             pre_status, pre_code = self.runAnsiblePlaybook(
-                playbook, args['timeout'], phase='pre', index=index)
+                playbook, ansible_timeout, phase='pre', index=index)
             if pre_status != self.RESULT_NORMAL or pre_code != 0:
                 # These should really never fail, so return None and have
                 # zuul try again
@@ -872,8 +893,9 @@
                 break
 
         if not pre_failed:
+            ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
             job_status, job_code = self.runAnsiblePlaybook(
-                self.jobdir.playbook, args['timeout'], phase='run')
+                self.jobdir.playbook, ansible_timeout, phase='run')
             if job_status == self.RESULT_ABORTED:
                 return 'ABORTED'
             elif job_status == self.RESULT_TIMED_OUT:
@@ -894,8 +916,9 @@
 
         for index, playbook in enumerate(self.jobdir.post_playbooks):
             # TODOv3(pabelanger): Implement post-run timeout setting.
+            ansible_timeout = self.getAnsibleTimeout(time_started, job_timeout)
             post_status, post_code = self.runAnsiblePlaybook(
-                playbook, args['timeout'], success, phase='post', index=index)
+                playbook, ansible_timeout, success, phase='post', index=index)
             if post_status == self.RESULT_ABORTED:
                 return 'ABORTED'
             if post_status != self.RESULT_NORMAL or post_code != 0:
@@ -1660,7 +1683,7 @@
                                             'load_multiplier', '2.5'))
         self.max_load_avg = multiprocessing.cpu_count() * load_multiplier
         self.max_starting_builds = self.max_load_avg * 2
-        self.min_starting_builds = 4
+        self.min_starting_builds = max(int(multiprocessing.cpu_count() / 2), 1)
         self.min_avail_mem = float(get_default(self.config, 'executor',
                                                'min_avail_mem', '5.0'))
         self.accepting_work = False

diff --git a/zuul/merger/merger.py b/zuul/merger/merger.py
index 07f3e69..5e102b4 100644
--- a/zuul/merger/merger.py
+++ b/zuul/merger/merger.py

@@ -261,14 +261,6 @@
         repo.git.checkout(ref)
         return repo.head.commit
 
-    def checkoutLocalBranch(self, branch):
-        # TODO(jeblair): retire in favor of checkout
-        repo = self.createRepoObject()
-        # Perform a hard reset before checking out so that we clean up
-        # anything that might be left over from a merge.
-        reset_repo_to_head(repo)
-        repo.heads[branch].checkout()
-
     def cherryPick(self, ref):
         repo = self.createRepoObject()
         self.log.debug("Cherry-picking %s" % ref)

diff --git a/zuul/rpcclient.py b/zuul/rpcclient.py
index 8f2e5dc..a947ed0 100644
--- a/zuul/rpcclient.py
+++ b/zuul/rpcclient.py

@@ -48,10 +48,12 @@
         self.log.debug("Job complete, success: %s" % (not job.failure))
         return job
 
-    def autohold(self, tenant, project, job, reason, count):
+    def autohold(self, tenant, project, job, change, ref, reason, count):
         data = {'tenant': tenant,
                 'project': project,
                 'job': job,
+                'change': change,
+                'ref': ref,
                 'reason': reason,
                 'count': count}
         return not self.submitJob('zuul:autohold', data).failure

diff --git a/zuul/rpclistener.py b/zuul/rpclistener.py
index e5016df..f3f55f6 100644
--- a/zuul/rpclistener.py
+++ b/zuul/rpclistener.py

@@ -150,7 +150,20 @@
             job.sendWorkException(error.encode('utf8'))
             return
 
+        if args['change'] and args['ref']:
+            job.sendWorkException("Change and ref can't be both used "
+                                  "for the same request")
+
+        if args['change']:
+            # Convert change into ref based on zuul connection
+            ref_filter = project.source.getRefForChange(args['change'])
+        elif args['ref']:
+            ref_filter = "%s" % args['ref']
+        else:
+            ref_filter = ".*"
+
         params['job_name'] = args['job']
+        params['ref_filter'] = ref_filter
         params['reason'] = args['reason']
 
         if args['count'] < 0:

diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index de42aa3..c06497d 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py

@@ -19,6 +19,7 @@
 import logging
 import os
 import pickle
+import re
 import queue
 import socket
 import sys
@@ -436,8 +437,9 @@
         self.last_reconfigured = int(time.time())
         # TODOv3(jeblair): reconfigure time should be per-tenant
 
-    def autohold(self, tenant_name, project_name, job_name, reason, count):
-        key = (tenant_name, project_name, job_name)
+    def autohold(self, tenant_name, project_name, job_name, ref_filter,
+                 reason, count):
+        key = (tenant_name, project_name, job_name, ref_filter)
         if count == 0 and key in self.autohold_requests:
             self.log.debug("Removing autohold for %s", key)
             del self.autohold_requests[key]
@@ -973,6 +975,84 @@
             self.log.exception("Exception estimating build time:")
         pipeline.manager.onBuildStarted(event.build)
 
+    def _getAutoholdRequestKey(self, build):
+        change = build.build_set.item.change
+
+        autohold_key_base = (build.pipeline.layout.tenant.name,
+                             change.project.canonical_name,
+                             build.job.name)
+
+        class Scope(object):
+            """Enum defining a precedence/priority of autohold requests.
+
+            Autohold requests for specific refs should be fulfilled first,
+            before those for changes, and generic jobs.
+
+            Matching algorithm goes over all existing autohold requests, and
+            returns one with the highest number (in case of duplicated
+            requests the last one wins).
+            """
+            NONE = 0
+            JOB = 1
+            CHANGE = 2
+            REF = 3
+
+        def autohold_key_base_issubset(base, request_key):
+            """check whether the given key is a subset of the build key"""
+            index = 0
+            base_len = len(base)
+            while index < base_len:
+                if base[index] != request_key[index]:
+                    return False
+                index += 1
+            return True
+
+        # Do a partial match of the autohold key against all autohold
+        # requests, ignoring the last element of the key (ref filter),
+        # and finally do a regex match between ref filter from
+        # the autohold request and the build's change ref to check
+        # if it matches. Lastly, make sure that we match the most
+        # specific autohold request by comparing "scopes"
+        # of requests - the most specific is selected.
+        autohold_key = None
+        scope = Scope.NONE
+        for request in self.autohold_requests:
+            ref_filter = request[-1]
+            if not autohold_key_base_issubset(autohold_key_base, request) \
+                or not re.match(ref_filter, change.ref):
+                continue
+
+            if ref_filter == ".*":
+                candidate_scope = Scope.JOB
+            elif ref_filter.endswith(".*"):
+                candidate_scope = Scope.CHANGE
+            else:
+                candidate_scope = Scope.REF
+
+            if candidate_scope > scope:
+                scope = candidate_scope
+                autohold_key = request
+
+        return autohold_key
+
+    def _processAutohold(self, build):
+
+        # We explicitly only want to hold nodes for jobs if they have
+        # failed and have an autohold request.
+        if build.result != "FAILURE":
+            return
+
+        autohold_key = self._getAutoholdRequestKey(build)
+        try:
+            self.nodepool.holdNodeSet(build.nodeset, autohold_key)
+        except Exception:
+            self.log.exception("Unable to process autohold for %s:",
+                               autohold_key)
+            if autohold_key in self.autohold_requests:
+                self.log.debug("Removing autohold %s due to exception",
+                               autohold_key)
+                del self.autohold_requests[autohold_key]
+
     def _doBuildCompletedEvent(self, event):
         build = event.build
 
@@ -980,27 +1060,10 @@
         # to pass this on to the pipeline manager, make sure we return
         # the nodes to nodepool.
         try:
-            nodeset = build.nodeset
-            autohold_key = (build.pipeline.layout.tenant.name,
-                            build.build_set.item.change.project.canonical_name,
-                            build.job.name)
-            if (build.result == "FAILURE" and
-                autohold_key in self.autohold_requests):
-                # We explicitly only want to hold nodes for jobs if they have
-                # failed and have an autohold request.
-                try:
-                    self.nodepool.holdNodeSet(nodeset, autohold_key)
-                except Exception:
-                    self.log.exception("Unable to process autohold for %s:",
-                                       autohold_key)
-                    if autohold_key in self.autohold_requests:
-                        self.log.debug("Removing autohold %s due to exception",
-                                       autohold_key)
-                        del self.autohold_requests[autohold_key]
-
-            self.nodepool.returnNodeSet(nodeset)
+            self._processAutohold(build)
+            self.nodepool.returnNodeSet(build.nodeset)
         except Exception:
-            self.log.exception("Unable to return nodeset %s" % (nodeset,))
+            self.log.exception("Unable to return nodeset %s" % build.nodeset)
 
         if build.build_set is not build.build_set.item.current_build_set:
             self.log.debug("Build %s is not in the current build set" %
commit	985bb4467931bf44167c7bc70f4de93b26641943	[log] [tgz]
author	Zuul <zuul@review.openstack.org>	Tue Feb 13 21:00:55 2018 +0000
committer	Gerrit Code Review <review@openstack.org>	Tue Feb 13 21:00:55 2018 +0000
tree	60edb44a2665faaaaa5b6c7fd4bb584c4044940e
parent	9bae3c0b18245f6d732cdae05b208cb05320e333 [diff]
parent	12c51791c21abe79906fca7953779b2fa50315af [diff]