Report the per-job build wait time to graphite

Knowing how long a job waits to build in aggregate can give useful
insights into the performance and capacity of the build system. This
change also uses the node labels sent back from the gearman worker to
submit metrics within that context.

Change-Id: I173251c8b1569755124b7cb1a48b6274bf38c94b
Depends-On: Ibca938fcf8a65facd7e39dab4eb994dfc637722a
diff --git a/tests/base.py b/tests/base.py
index 535bb7f..fd27a89 100755
--- a/tests/base.py
+++ b/tests/base.py
@@ -598,6 +598,8 @@
             result = 'RUN_ERROR'
         else:
             data['result'] = result
+            data['node_labels'] = ['bare-necessities']
+            data['node_name'] = 'foo'
             work_fail = False
 
         changes = None
diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py
index bc2c152..10f2854 100644
--- a/zuul/cmd/client.py
+++ b/zuul/cmd/client.py
@@ -232,6 +232,12 @@
             'number': {
                 'title': 'Number'
             },
+            'node_labels': {
+                'title': 'Node Labels'
+            },
+            'node_name': {
+                'title': 'Node Name'
+            },
             'worker.name': {
                 'title': 'Worker'
             },
@@ -245,7 +251,7 @@
             'worker.fqdn': {
                 'title': 'Worker Domain'
             },
-            'worker.progam': {
+            'worker.program': {
                 'title': 'Worker Program'
             },
             'worker.version': {
diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py
index 57ac5ca..6c8b689 100644
--- a/zuul/launcher/gearman.py
+++ b/zuul/launcher/gearman.py
@@ -423,9 +423,11 @@
 
         build = self.builds.get(job.unique)
         if build:
+            data = getJobData(job)
+            build.node_labels = data.get('node_labels')
+            build.node_name = data.get('node_name')
             if not build.canceled:
                 if result is None:
-                    data = getJobData(job)
                     result = data.get('result')
                 if result is None:
                     build.retry = True
diff --git a/zuul/model.py b/zuul/model.py
index 4b907c3..e852033 100644
--- a/zuul/model.py
+++ b/zuul/model.py
@@ -578,6 +578,8 @@
         self.retry = False
         self.parameters = {}
         self.worker = Worker()
+        self.node_labels = []
+        self.node_name = None
 
     def __repr__(self):
         return ('<Build %s of %s on %s>' %
@@ -799,7 +801,9 @@
                 'canceled': build.canceled if build else None,
                 'retry': build.retry if build else None,
                 'number': build.number if build else None,
-                'worker': worker
+                'node_labels': build.node_labels if build else [],
+                'node_name': build.node_name if build else None,
+                'worker': worker,
             })
 
         if self.pipeline.haveAllJobsStarted(self):
diff --git a/zuul/scheduler.py b/zuul/scheduler.py
index 9620036..f3ecc03 100644
--- a/zuul/scheduler.py
+++ b/zuul/scheduler.py
@@ -533,6 +533,16 @@
     def onBuildStarted(self, build):
         self.log.debug("Adding start event for build: %s" % build)
         build.start_time = time.time()
+        try:
+            if statsd and build.pipeline:
+                jobname = build.job.name.replace('.', '_')
+                key = 'zuul.pipeline.%s.job.%s.wait_time' % (
+                    build.pipeline.name, jobname)
+                dt = int((build.start_time - build.launch_time) * 1000)
+                statsd.timing(key, dt)
+                statsd.incr(key)
+        except:
+            self.log.exception("Exception reporting runtime stats")
         event = BuildStartedEvent(build)
         self.result_event_queue.put(event)
         self.wake_event.set()
@@ -544,14 +554,24 @@
         try:
             if statsd and build.pipeline:
                 jobname = build.job.name.replace('.', '_')
+                key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
+                for label in build.node_labels:
+                    # Jenkins includes the node name in its list of labels, so
+                    # we filter it out here, since that is not statistically
+                    # interesting.
+                    if label == build.node_name:
+                        continue
+                    dt = int((build.start_time - build.launch_time) * 1000)
+                    key = 'zuul.node_type.%s.job.%s.wait_time' % (
+                        label, jobname)
+                    statsd.timing(key, dt)
+                    statsd.incr(key)
                 key = 'zuul.pipeline.%s.job.%s.%s' % (build.pipeline.name,
                                                       jobname, build.result)
                 if build.result in ['SUCCESS', 'FAILURE'] and build.start_time:
                     dt = int((build.end_time - build.start_time) * 1000)
                     statsd.timing(key, dt)
                 statsd.incr(key)
-                key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
-                statsd.incr(key)
         except:
             self.log.exception("Exception reporting runtime stats")
         event = BuildCompletedEvent(build)