Merger: retry network operations

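Try cloning and fetching up to 3 times, 30 seconds apart by default
(configurable via retry_attempts and retry_interval), to add some
robustness against transient network failures.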

Change-Id: I4f113a0d1313ea4485086f62514825ec980584a6
diff --git a/tests/fixtures/git_fetch_error.sh b/tests/fixtures/git_fetch_error.sh
new file mode 100755
index 0000000..49c568c
--- /dev/null
+++ b/tests/fixtures/git_fetch_error.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+#
+# Fake git executable used by the merger tests to exercise fetch retries.
+# The first "fetch" creates ./stamp1 and fails; any later "fetch" sees
+# ./stamp1, creates ./stamp2 and succeeds.  "version" prints a fixed
+# version string.  Any other command just echoes its arguments.
+
+echo "$*"
+case "$1" in
+    fetch)
+        if [ -f ./stamp1 ]; then
+            # A previous fetch already failed; succeed this time.
+            touch ./stamp2
+            exit 0
+        fi
+        # First fetch: leave a marker and report failure.
+        touch ./stamp1
+        exit 1
+        ;;
+    version)
+        echo "git version 1.0.0"
+        exit 0
+        ;;
+esac
diff --git a/tests/unit/test_merger_repo.py b/tests/unit/test_merger_repo.py
index ec30a2b..fb2f199 100644
--- a/tests/unit/test_merger_repo.py
+++ b/tests/unit/test_merger_repo.py
@@ -82,7 +82,7 @@
                    os.path.join(FIXTURE_DIR, 'fake_git.sh'))
         work_repo = Repo(parent_path, self.workspace_root,
                          'none@example.org', 'User Name', '0', '0',
-                         git_timeout=0.001)
+                         git_timeout=0.001, retry_attempts=1)
         # TODO: have the merger and repo classes catch fewer
         # exceptions, including this one on initialization.  For the
         # test, we try cloning again.
@@ -93,10 +93,26 @@
     def test_fetch_timeout(self):
         parent_path = os.path.join(self.upstream_root, 'org/project1')
         work_repo = Repo(parent_path, self.workspace_root,
-                         'none@example.org', 'User Name', '0', '0')
+                         'none@example.org', 'User Name', '0', '0',
+                         retry_attempts=1)
         work_repo.git_timeout = 0.001
         self.patch(git.Git, 'GIT_PYTHON_GIT_EXECUTABLE',
                    os.path.join(FIXTURE_DIR, 'fake_git.sh'))
         with testtools.ExpectedException(git.exc.GitCommandError,
                                          '.*exit code\(-9\)'):
             work_repo.update()
+
+    def test_fetch_retry(self):
+        parent_path = os.path.join(self.upstream_root, 'org/project1')
+        work_repo = Repo(parent_path, self.workspace_root,
+                         'none@example.org', 'User Name', '0', '0',
+                         retry_interval=1)
+        self.patch(git.Git, 'GIT_PYTHON_GIT_EXECUTABLE',
+                   os.path.join(FIXTURE_DIR, 'git_fetch_error.sh'))
+        work_repo.update()
+        # This is created on the first fetch
+        self.assertTrue(os.path.exists(os.path.join(
+            self.workspace_root, 'stamp1')))
+        # This is created on the second fetch
+        self.assertTrue(os.path.exists(os.path.join(
+            self.workspace_root, 'stamp2')))
diff --git a/zuul/merger/merger.py b/zuul/merger/merger.py
index 035dbf5..c221478 100644
--- a/zuul/merger/merger.py
+++ b/zuul/merger/merger.py
@@ -17,6 +17,7 @@
 import logging
 import os
 import shutil
+import time
 
 import git
 import gitdb
@@ -59,7 +60,8 @@
 
 class Repo(object):
     def __init__(self, remote, local, email, username, speed_limit, speed_time,
-                 sshkey=None, cache_path=None, logger=None, git_timeout=300):
+                 sshkey=None, cache_path=None, logger=None, git_timeout=300,
+                 retry_attempts=3, retry_interval=30):
         if logger is None:
             self.log = logging.getLogger("zuul.Repo")
         else:
@@ -78,6 +80,8 @@
         self.username = username
         self.cache_path = cache_path
         self._initialized = False
+        self.retry_attempts = retry_attempts
+        self.retry_interval = retry_interval
         try:
             self._ensure_cloned()
         except Exception:
@@ -123,14 +127,55 @@
     def _git_clone(self, url):
+        """Clone ``url`` into ``self.local_path``, retrying on failure.
+
+        The clone is attempted up to ``self.retry_attempts`` times (but
+        always at least once), sleeping ``self.retry_interval`` seconds
+        between attempts.  If the last attempt fails, its exception
+        propagates to the caller.
+        """
         mygit = git.cmd.Git(os.getcwd())
         mygit.update_environment(**self.env)
-        with timeout_handler(self.local_path):
-            mygit.clone(git.cmd.Git.polish_url(url), self.local_path,
-                        kill_after_timeout=self.git_timeout)
+
+        # Always make at least one attempt: with retry_attempts < 1 the
+        # loop would otherwise never run and the clone would be skipped.
+        attempts = max(self.retry_attempts, 1)
+        for attempt in range(1, attempts + 1):
+            try:
+                with timeout_handler(self.local_path):
+                    mygit.clone(git.cmd.Git.polish_url(url), self.local_path,
+                                kill_after_timeout=self.git_timeout)
+                break
+            except Exception:
+                if attempt >= attempts:
+                    raise
+                # Log before sleeping so the failure is reported when it
+                # happens; exc_info keeps the cause of the failed attempt.
+                self.log.warning("Retry %s: Clone %s", attempt,
+                                 self.local_path, exc_info=True)
+                time.sleep(self.retry_interval)
 
     def _git_fetch(self, repo, remote, ref=None, **kwargs):
-        with timeout_handler(self.local_path):
-            repo.git.fetch(remote, ref, kill_after_timeout=self.git_timeout,
-                           **kwargs)
+        """Fetch ``ref`` from ``remote``, retrying on failure.
+
+        The fetch is attempted up to ``self.retry_attempts`` times (but
+        always at least once), sleeping ``self.retry_interval`` seconds
+        between attempts and calling ``self._ensure_cloned()`` before each
+        retry.  If the last attempt fails, its exception propagates.
+        """
+        attempts = max(self.retry_attempts, 1)
+        for attempt in range(1, attempts + 1):
+            try:
+                with timeout_handler(self.local_path):
+                    repo.git.fetch(remote, ref,
+                                   kill_after_timeout=self.git_timeout,
+                                   **kwargs)
+                break
+            except Exception:
+                if attempt >= attempts:
+                    raise
+                self.log.exception("Retry %s: Fetch %s %s %s", attempt,
+                                   self.local_path, remote, ref)
+                time.sleep(self.retry_interval)
+                self._ensure_cloned()
 
     def createRepoObject(self):
         self._ensure_cloned()