Merge "Add memory awareness to system load governor"
diff --git a/doc/source/admin/components.rst b/doc/source/admin/components.rst
index d6b0984..2e18b51 100644
--- a/doc/source/admin/components.rst
+++ b/doc/source/admin/components.rst
@@ -575,6 +575,16 @@
The executor will observe system load and determine whether
to accept more jobs every 30 seconds.
+ .. attr:: min_avail_mem
+ :default: 5.0
+
+ This is the minimum percentage of system RAM that must be
+ available. Below this threshold, the executor will stop accepting
+ more than 1 job at a time until more memory is available. The
+ available memory percentage is calculated as the total available
+ memory divided by the total real memory, multiplied by 100.
+ Buffers and cache are considered available in the calculation.
+
.. attr:: hostname
:default: hostname of the server
diff --git a/requirements.txt b/requirements.txt
index 39a2b02..3ab5850 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,3 +27,4 @@
iso8601
aiohttp
uvloop;python_version>='3.5'
+psutil
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index e72fc13..d2c04ac 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -18,6 +18,7 @@
import logging
import multiprocessing
import os
+import psutil
import shutil
import signal
import shlex
@@ -1952,6 +1953,7 @@
''' Apply some heuristics to decide whether or not we should
be askign for more jobs '''
load_avg = os.getloadavg()[0]
+ avail_mem_pct = 100.0 - psutil.virtual_memory().percent
if self.accepting_work:
# Don't unregister if we don't have any active jobs.
if load_avg > self.max_load_avg and self.job_workers:
@@ -1959,10 +1961,19 @@
"Unregistering due to high system load {} > {}".format(
load_avg, self.max_load_avg))
self.unregister_work()
- elif load_avg <= self.max_load_avg:
+ elif avail_mem_pct < self.min_avail_mem:
+ self.log.info(
+ "Unregistering due to low memory {:3.1f}% < {}".format(
+ avail_mem_pct, self.min_avail_mem))
+ self.unregister_work()
+ elif (load_avg <= self.max_load_avg and
+ avail_mem_pct >= self.min_avail_mem):
self.log.info(
- "Re-registering as load is within limits {} <= {}".format(
- load_avg, self.max_load_avg))
+ "Re-registering as job is within limits "
+ "{} <= {} {:3.1f}% <= {}".format(load_avg,
+ self.max_load_avg,
+ avail_mem_pct,
+ self.min_avail_mem))
self.register_work()
if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname