build: download code from github using archive API

A new Python script, scripts/download.py, is added to fetch tarballs using
the GitHub archive API [1] and then repack them reproducibly, in the same
way as the current DownloadMethod/git.

GitHub imposes a 60 reqs/hour rate limit on unauthenticated API
access [2].  This affects fetching the commit date that is fed to tar's
--mtime= argument.  However, observation indicates that archive download
is NOT subject to this limit at the moment.  In the rare cases where a
download fails because of this, we will fall back to using
DownloadMethod/git.

The missing piece in the GitHub API is that the tarball it provides does
not include the source code of dependent submodules.  In that case, the
implementation will also fall back to using DownloadMethod/git.

 [1] Get archive link, https://developer.github.com/v3/repos/contents/#get-archive-link
 [2] Rate limiting, https://developer.github.com/v3/#rate-limiting

v2 <- v1:

 - allow passing multiple urls with --urls argument
 - add commit ts cache.  can be helpful on retry

Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
This commit is contained in:
Yousong Zhou 2018-02-11 17:42:22 +08:00
parent 3ce11588f6
commit 75ab064d2b
2 changed files with 464 additions and 31 deletions

View file

@ -21,23 +21,7 @@ DOWNLOAD_RDEP=$(STAMP_PREPARED) $(HOST_STAMP_PREPARED)
# Try to guess the download method from the URL
define dl_method
$(strip \
$(if $(2),$(2), \
$(if $(filter @APACHE/% @GITHUB/% @GNOME/% @GNU/% @KERNEL/% @SF/% @SAVANNAH/% ftp://% http://% https://% file://%,$(1)),default, \
$(if $(filter git://%,$(1)),git, \
$(if $(filter svn://%,$(1)),svn, \
$(if $(filter cvs://%,$(1)),cvs, \
$(if $(filter hg://%,$(1)),hg, \
$(if $(filter sftp://%,$(1)),bzr, \
unknown \
) \
) \
) \
) \
) \
) \
) \
)
$(shell $(SCRIPT_DIR)/download.py dl_method --url $(foreach url,$(1),"$(url)") --proto="$(2)")
endef
# code for creating tarballs from cvs/svn/git/bzr/hg/darcs checkouts - useful for mirror support
@ -56,6 +40,10 @@ ifdef CHECK
check_escape=$(subst ','\'',$(1))
#')
# $(1): suffix of the F_, C_ variables, e.g. hash_deprecated, hash_mismatch, etc.
# $(2): filename
# $(3): expected hash value
# $(4): hash var name: MD5SUM, HASH
check_warn_nofix = $(info $(shell printf "$(_R)WARNING: %s$(_N)" '$(call check_escape,$(call C_$(1),$(2),$(3),$(4)))'))
ifndef FIXUP
check_warn = $(check_warn_nofix)
@ -71,6 +59,9 @@ F_hash_mismatch = $(F_hash_deprecated)
F_hash_missing = $(SCRIPT_DIR)/fixup-makefile.pl $(CURDIR)/Makefile add-hash $(3) $(call gen_sha256sum,$(1))
endif
# $(1): filename
# $(2): expected hash value
# $(3): hash var name: MD5SUM, HASH
C_download_missing = $(1) is missing, please run make download before re-running this check
C_hash_mismatch = $(3) does not match $(1) hash $(call gen_sha256sum,$(1))
C_hash_deprecated = $(3) uses deprecated hash, set to $(call gen_sha256sum,$(1))
@ -116,6 +107,9 @@ define DownloadMethod/default
)
endef
# $(1): "check"
# $(2): "PKG_" if <name> as in Download/<name> is "default", otherwise "Download/<name>:"
# $(3): shell command sequence to do the download
define wrap_mirror
$(if $(if $(MIRROR),$(filter-out x,$(MIRROR_HASH))),$(SCRIPT_DIR)/download.pl "$(DL_DIR)" "$(FILE)" "$(MIRROR_HASH)" "" || ( $(3) ),$(3)) \
$(if $(filter check,$(1)), \
@ -159,23 +153,41 @@ endef
define DownloadMethod/git
$(call wrap_mirror,$(1),$(2), \
echo "Checking out files from the git repository..."; \
mkdir -p $(TMP_DIR)/dl && \
cd $(TMP_DIR)/dl && \
rm -rf $(SUBDIR) && \
[ \! -d $(SUBDIR) ] && \
git clone $(OPTS) $(URL) $(SUBDIR) && \
(cd $(SUBDIR) && git checkout $(VERSION) && \
git submodule update --init --recursive) && \
echo "Packing checkout..." && \
export TAR_TIMESTAMP=`cd $(SUBDIR) && git log -1 --format='@%ct'` && \
rm -rf $(SUBDIR)/.git && \
$(call dl_tar_pack,$(TMP_DIR)/dl/$(FILE),$(SUBDIR)) && \
mv $(TMP_DIR)/dl/$(FILE) $(DL_DIR)/ && \
rm -rf $(SUBDIR); \
$(call DownloadMethod/git-raw) \
)
endef
# Download method that asks scripts/download.py ("dl" action) to fetch and
# repack the source tarball, passing it the download dir, URL list, proto,
# version, subdir and target file name.
# If the script exits with a failure, the plain git checkout sequence from
# DownloadMethod/git-raw is run instead.
# $(1), $(2): forwarded unchanged to wrap_mirror.
# NOTE(review): the script declares the option as --urls; --url presumably
# relies on argparse accepting unambiguous option prefixes -- confirm.
define DownloadMethod/github-tarball
$(call wrap_mirror,$(1),$(2), \
$(SCRIPT_DIR)/download.py dl \
--dl-dir="$(DL_DIR)" \
--url $(foreach url,$(URL),"$(url)") \
--proto="$(PROTO)" \
--version="$(VERSION)" \
--subdir="$(SUBDIR)" \
--source="$(FILE)" \
|| ( $(call DownloadMethod/git-raw) ); \
)
endef
# Only intended to be called as a submethod from other DownloadMethod
# definitions; it is not wrapped in wrap_mirror itself.
# Shell sequence: clone $(URL) into $(TMP_DIR)/dl/$(SUBDIR), check out
# $(VERSION), initialise submodules recursively, record the last commit's
# committer time in TAR_TIMESTAMP, drop the .git directory, pack the tree
# with dl_tar_pack and move the resulting $(FILE) into $(DL_DIR).
define DownloadMethod/git-raw
echo "Checking out files from the git repository..."; \
mkdir -p $(TMP_DIR)/dl && \
cd $(TMP_DIR)/dl && \
rm -rf $(SUBDIR) && \
[ \! -d $(SUBDIR) ] && \
git clone $(OPTS) $(URL) $(SUBDIR) && \
(cd $(SUBDIR) && git checkout $(VERSION) && \
git submodule update --init --recursive) && \
echo "Packing checkout..." && \
export TAR_TIMESTAMP=`cd $(SUBDIR) && git log -1 --format='@%ct'` && \
rm -rf $(SUBDIR)/.git && \
$(call dl_tar_pack,$(TMP_DIR)/dl/$(FILE),$(SUBDIR)) && \
mv $(TMP_DIR)/dl/$(FILE) $(DL_DIR)/ && \
rm -rf $(SUBDIR);
endef
define DownloadMethod/bzr
$(call wrap_mirror,$(1),$(2), \
echo "Checking out files from the bzr repository..."; \

421
scripts/download.py Executable file
View file

@ -0,0 +1,421 @@
#!/usr/bin/env python
#
# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.
import argparse
import calendar
import datetime
import errno
import fcntl
import json
import os
import os.path
import re
import shutil
import ssl
import subprocess
import sys
import time
import urllib2
# Scratch root: the TMP_DIR environment variable when it is set and
# non-empty, otherwise /tmp.
TMPDIR = os.getenv('TMP_DIR') or '/tmp'
# Working directory for downloads inside the scratch root.
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
# Placeholder; rebound to the real list of download method classes once
# those classes have been defined further down the file.
DOWNLOAD_METHODS = list()
class PathException(Exception):
    """Error raised by the ``Path`` tar/untar helpers."""
class DownloadException(Exception):
    """Error raised when a download method cannot complete its work."""
class Path(object):
    """Context class for preparing and cleaning up directories.

    If ``isdir`` is true, ``path`` is created as a directory (together with
    any missing parents) on context enter.
    If ``keep`` is True, then ``path`` will NOT be removed on context exit;
    otherwise it is removed recursively, whether or not an exception was
    raised inside the context.
    """

    def __init__(self, path, isdir=True, keep=False):
        self.path = path
        self.isdir = isdir
        self.keep = keep

    def __enter__(self):
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing is returned, so an in-flight exception keeps propagating.
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p.

        Creates ``path`` and every missing parent directory.  An already
        existing directory is not an error; any other failure (including
        ``path`` existing as a non-directory) raises OSError.
        """
        # os.path.split() only yields (head, tail), so walking its result
        # cannot create more than one missing level; os.makedirs() can.
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST or not os.path.isdir(path):
                raise

    @staticmethod
    def _rmdir_all(dir_):
        """Recursively remove directory ``dir_`` and everything below it."""
        for name in Path._listdir(dir_):
            p = os.path.join(dir_, name)
            # A symlink to a directory is unlinked, never descended into,
            # so the link target's content is left alone.
            if os.path.isdir(p) and not os.path.islink(p):
                Path._rmdir_all(p)
            else:
                Path._remove(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        """Create a single directory, ignoring "already exists"."""
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        """Remove an empty directory, ignoring "no such file"."""
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        """Remove a file or symlink, ignoring "no such file"."""
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        """List a directory; a missing directory yields an empty list."""
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
        return ``default``, otherwise, re-raise
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == errno:
                return default
            raise

    @staticmethod
    def rm_all(path):
        """Same as rm -r.

        A missing ``path`` is not an error.  A symlink is removed itself;
        whatever it points to is not touched.
        """
        if os.path.isdir(path) and not os.path.islink(path):
            Path._rmdir_all(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract gzip'ed tarball at ``path`` into directory ``into``.

        Return the name of the single entry found in ``into`` afterwards;
        raise PathException when there is not exactly one entry.  Raises
        subprocess.CalledProcessError when tar itself fails.
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        # Extract under a fixed umask so resulting modes do not depend on
        # the caller's environment.
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` found under ``path`` into tarball ``into``.

        The compressor is picked from the suffix of ``into`` (.xz, .bz2 or
        .gz); any other suffix raises PathException.  When ``ts`` is given
        it is used as the mtime (seconds since the epoch) of every member.
        Raises subprocess.CalledProcessError when tar itself fails.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            # -n: keep name and timestamp out of the gzip header
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
class GitHubCommitTsCache(object):
    """File-backed cache mapping a key to a commit timestamp.

    The cache file lives in the download scratch directory and holds one
    ``key timestamp updated`` line per entry.  Access is serialised with
    fcntl locks on the file.  At most ``__cachen`` most recently updated
    entries are kept when the file is rewritten.
    """
    __cachef = 'github.commit.ts.cache'
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``.

        Return the cached integer timestamp, or None when ``k`` is unknown.
        """
        # 0o666 is filtered by the umask, like a plain open() would be.
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT, 0o666)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    return self.cache[k][0]
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``.

        The file is re-read, the entry for ``k`` is set to ``v`` with the
        current time as its update stamp, and the file is rewritten, all
        under an exclusive lock.
        """
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT, 0o666)
        # 'r+': read the existing content first, then rewrite in place.
        with os.fdopen(fileno, 'r+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        """Load entries from the open cache file ``fin`` into ``self.cache``.

        Lines that do not parse as ``key int int`` are skipped: the cache
        is only an optimisation, so a damaged line (for example from an
        interrupted write) must not make every later lookup fail.
        """
        for line in fin:
            fields = line.split()
            if len(fields) != 3:
                continue
            k, ts, updated = fields
            try:
                self.cache[k] = (int(ts), int(updated))
            except ValueError:
                continue

    def _cache_flush(self, fout):
        """Rewrite the cache file ``fout`` from ``self.cache``.

        Entries are written most recently updated first, capped at
        ``__cachen``; the in-memory cache is emptied afterwards.
        """
        entries = sorted(self.cache.items(), key=lambda kv: kv[1][1], reverse=True)
        entries = entries[:self.__cachen]
        self.cache = {}
        fout.seek(0, os.SEEK_SET)
        fout.truncate()
        for k, (ts, updated) in entries:
            fout.write('{0} {1} {2}\n'.format(k, ts, updated))
        # Push the data out while the caller still holds the lock; leaving
        # it to close() would write after the lock has been released.
        fout.flush()
class DownloadMethod(object):
    """Base class of all download method.

    Keeps the parsed command line arguments and exposes the URL list, the
    first URL and the download directory as attributes.
    """

    def __init__(self, args):
        self.args = args
        self.urls = args.urls
        self.url = self.urls[0]
        self.dl_dir = args.dl_dir

    @classmethod
    def resolve(cls, args):
        """Resolve download method to use.

        Return an instance of the first class in DOWNLOAD_METHODS whose
        ``match()`` accepts ``args``, or None when no class matches.
        """
        for c in DOWNLOAD_METHODS:
            if c.match(args):
                return c(args)
        return None

    @staticmethod
    def match(args):
        """Return True if it can do the download.

        Must be overridden; the base class raises NotImplementedError.
        """
        # NotImplemented is a truthy sentinel meant for binary operators;
        # returning it here would read as "matches".
        raise NotImplementedError

    def download(self):
        """Do the download and put it into the download dir.

        Must be overridden; the base class raises NotImplementedError so
        that a missing override cannot pass for a successful download.
        """
        raise NotImplementedError
class DownloadMethodGitHubTarball(DownloadMethod):
    """Download and repack archive tarball from GitHub."""

    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        super(DownloadMethodGitHubTarball, self).__init__(args)
        self._init_owner_repo()
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    @staticmethod
    def match(args):
        """Match if it's a GitHub clone url and the proto is ``git``."""
        url = args.urls[0]
        if args.proto != 'git' or not isinstance(url, basestring):
            return False
        return url.startswith(('https://github.com/', 'git://github.com/'))

    def download(self):
        """Download and repack GitHub archive tarball.

        The archive is fetched into the scratch directory, unpacked, its
        top-level directory renamed to ``subdir``, repacked with the commit
        timestamp as mtime and finally moved into the download directory.
        Raises DownloadException when the tree declares git submodules.
        """
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                # Leftovers of an interrupted run would make untar() see
                # more than one entry, so start from a clean directory.
                Path.rm_all(d)
                with Path(d) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self._has_submodule(dir0):
                        raise DownloadException('unable to fetch submodules\' source code')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    try:
                        Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                        # move to target location
                        file1 = os.path.join(self.dl_dir, self.source)
                        if into != file1:
                            shutil.move(into, file1)
                    except Exception:
                        # do not leave a partial tarball behind
                        Path.rm_all(into)
                        raise

    def _has_submodule(self, dir_):
        """Return True when ``dir_`` holds a non-empty .gitmodules file.

        A stat failure other than "no such file" also counts as True.
        """
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        """Parse owner and repository name out of the first URL.

        A trailing ``.git`` is stripped from the repository name.  Raises
        DownloadException when the URL is not a GitHub repository URL.
        """
        url = self.url
        m = self.__repo_url_regex.search(url)
        if m is None:
            raise DownloadException('invalid github url: %s' % url)
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_commit_ts(self):
        """Load the committer timestamp of ``version`` into ``commit_ts``.

        The timestamp cache is consulted first; on a miss the commits API
        endpoint is queried and the result is stored in the cache.
        """
        if self.commit_ts is not None:
            return
        url = self._make_repo_url_path('commits', self.version)
        ct = self.commit_ts_cache.get(url)
        if ct is not None:
            self.commit_ts = ct
            return
        resp = self._make_request(url)
        try:
            data = resp.read()
        finally:
            resp.close()
        data = json.loads(data)
        date = data['commit']['committer']['date']
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        # The parsed value is treated as UTC when converted to epoch seconds.
        ct = calendar.timegm(date.timetuple())
        self.commit_ts = ct
        self.commit_ts_cache.set(url, ct)

    def _fetch(self, path):
        """Fetch tarball of the specified version ref into file ``path``."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        try:
            with open(path, 'wb') as fout:
                shutil.copyfileobj(resp, fout, 4096)
        finally:
            resp.close()

    def _make_repo_url_path(self, *args):
        """Build ``/repos/<owner>/<repo>[/<arg>...]`` for the API."""
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``.

        Return the open response object; the caller has to close it.
        """
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib2.Request(url, headers=headers)
        # NOTE(review): TLS certificate verification is switched off here.
        # Presumably integrity is left to a hash check done by the build
        # system after the download -- confirm before relying on it.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib2.urlopen(req, context=sslcontext)
        return fileobj
class DownloadMethodCatchall(DownloadMethod):
    """Dummy method that knows names but not ways of download."""

    def __init__(self, args):
        super(DownloadMethodCatchall, self).__init__(args)
        self.proto = args.proto
        self.name = self._resolve_name()

    def _resolve_name(self):
        """Return the download method name for the given arguments.

        An explicit proto wins.  Otherwise the name is guessed from the URL
        prefixes: the first table row with a prefix matching any of the
        URLs decides, and ``unknown`` is the fallback.
        """
        if self.proto:
            return self.proto
        methods_map = (
            ('default', ('@APACHE/', '@GITHUB/', '@GNOME/', '@GNU/',
                         '@KERNEL/', '@SF/', '@SAVANNAH/', 'ftp://', 'http://',
                         'https://', 'file://')),
            ('git', ('git://', )),
            ('svn', ('svn://', )),
            ('cvs', ('cvs://', )),
            # hg:// must map to hg; it used to fall through to "unknown"
            ('hg', ('hg://', )),
            ('bzr', ('sftp://', )),
        )
        for name, prefixes in methods_map:
            if any(url.startswith(prefixes) for url in self.urls):
                return name
        return 'unknown'

    @staticmethod
    def match(args):
        """Return True."""
        return True

    def download(self):
        """Not implemented.

        raise DownloadException
        """
        raise DownloadException('download method for %s is not yet implemented' % self.name)
# Order matters: DownloadMethod.resolve() walks this list front to back and
# takes the first class whose match() accepts the arguments, and the
# catch-all accepts everything, so it has to come last.
DOWNLOAD_METHODS = [DownloadMethodGitHubTarball, DownloadMethodCatchall]
def main():
    """Command line entry point.

    ``dl_method`` prints the name of the download method resolved for the
    given URLs/proto; ``dl`` performs the download.  A download or
    repacking failure is reported on stderr and ends the process with exit
    status 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('action', choices=('dl_method', 'dl'), help='Action to take')
    # Required: every download method reads the first URL on construction.
    parser.add_argument('--urls', nargs='+', metavar='URL', required=True, help='Download URLs')
    parser.add_argument('--proto', help='Download proto')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    args = parser.parse_args()

    method = DownloadMethod.resolve(args)
    if args.action == 'dl_method':
        sys.stdout.write(method.name + '\n')
    elif args.action == 'dl':
        try:
            method.download()
        except (DownloadException, PathException) as e:
            # Anticipated failures get a one-line message instead of a
            # traceback; anything else still propagates untouched.
            sys.stderr.write('download.py: %s\n' % e)
            sys.exit(1)
# Run the command line interface only when executed as a script.
if __name__ == '__main__':
    main()