scripts/dl_github_archive.py: rename from download.py

- Make the code more GitHub-specific
 - Requires mirror hash to work with .gitattributes
 - Use different API depending on whether PKG_SOURCE_VERSION is a
   complete commit id or other ref types like tags
 - Fix removing symbolic link
 - pre-clean dir_untar for possible leftovers from previous run

Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
This commit is contained in:
Yousong Zhou 2018-06-28 18:27:27 +08:00
parent e48ea13b3b
commit 04b9f85873
2 changed files with 130 additions and 125 deletions

View file

@ -172,25 +172,25 @@ endef
define DownloadMethod/git
$(call wrap_mirror,$(1),$(2), \
$(call DownloadMethod/git-raw) \
$(call DownloadMethod/rawgit) \
)
endef
define DownloadMethod/github-tarball
define DownloadMethod/github_archive
$(call wrap_mirror,$(1),$(2), \
$(SCRIPT_DIR)/download.py dl \
$(SCRIPT_DIR)/dl_github_archive.py \
--dl-dir="$(DL_DIR)" \
--url $(foreach url,$(URL),"$(url)") \
--proto="$(PROTO)" \
--url="$(URL)" \
--version="$(VERSION)" \
--subdir="$(SUBDIR)" \
--source="$(FILE)" \
|| ( $(call DownloadMethod/git-raw) ); \
--hash="$(MIRROR_HASH)" \
|| ( $(call DownloadMethod/rawgit) ); \
)
endef
# Only intends to be called as a submethod from other DownloadMethod
define DownloadMethod/git-raw
define DownloadMethod/rawgit
echo "Checking out files from the git repository..."; \
mkdir -p $(TMP_DIR)/dl && \
cd $(TMP_DIR)/dl && \

View file

@ -10,6 +10,7 @@ import calendar
import datetime
import errno
import fcntl
import hashlib
import json
import os
import os.path
@ -23,26 +24,31 @@ import urllib2
TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
DOWNLOAD_METHODS = []
class PathException(Exception): pass
class DownloadException(Exception): pass
class DownloadGitHubError(Exception): pass
class Path(object):
"""Context class for preparing and cleaning up directories.
If ``preclean`` is ``False``, ``path`` will NOT be removed on context enter.
If ``path`` ``isdir``, then it will be created on context enter.
If ``keep`` is True, then ``path`` will NOT be removed on context exit
"""
def __init__(self, path, isdir=True, keep=False):
def __init__(self, path, isdir=True, preclean=False, keep=False):
self.path = path
self.isdir = isdir
self.preclean = preclean
self.keep = keep
def __enter__(self):
if self.preclean:
self.rm_all(self.path)
if self.isdir:
self.mkdir_all(self.path)
return self
@ -61,14 +67,11 @@ class Path(object):
Path._mkdir(p)
@staticmethod
def _rmdir_all(dir_):
def _rmdir_dir(dir_):
names = Path._listdir(dir_)
for name in names:
p = os.path.join(dir_, name)
if os.path.isdir(p):
Path._rmdir_all(p)
else:
Path._remove(p)
Path.rm_all(p)
Path._rmdir(dir_)
@staticmethod
@ -105,8 +108,10 @@ class Path(object):
@staticmethod
def rm_all(path):
"""Same as rm -r."""
if os.path.isdir(path):
Path._rmdir_all(path)
if os.path.islink(path):
Path._remove(path)
elif os.path.isdir(path):
Path._rmdir_dir(path)
else:
Path._remove(path)
@ -201,60 +206,47 @@ class GitHubCommitTsCache(object):
fout.write(line)
class DownloadMethod(object):
"""Base class of all download method."""
class DownloadGitHubTarball(object):
"""Download and repack archive tarball from GitHub.
def __init__(self, args):
self.args = args
self.urls = args.urls
self.url = self.urls[0]
self.dl_dir = args.dl_dir
Compared with the method of packing after cloning the whole repo, this
method is more friendly to users with a fragile internet connection.
@classmethod
def resolve(cls, args):
"""Resolve download method to use.
However, there are limitations with this method
return instance of subclass of DownloadMethod
- GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
This affects fetching commit date for reproducible tarballs. Download
through the archive link is not affected.
- GitHub archives do not contain source code for submodules.
- GitHub archives seem to respect .gitattributes and ignore paths with
export-ignore attributes.
For the first two issues, the method will fail loudly to allow fallback to
clone-then-pack method.
As for the 3rd issue, to make sure that this method produces tarballs
identical to those of the fallback method, we require the expected hash
value to be supplied. That means the first tarball will need to be
prepared by the clone-then-pack method.
"""
for c in DOWNLOAD_METHODS:
if c.match(args):
return c(args)
@staticmethod
def match(args):
"""Return True if it can do the download."""
return NotImplemented
def download(self):
"""Do the download and put it into the download dir."""
return NotImplemented
class DownloadMethodGitHubTarball(DownloadMethod):
"""Download and repack archive tarball from GitHub."""
__repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')
def __init__(self, args):
super(DownloadMethodGitHubTarball, self).__init__(args)
self._init_owner_repo()
self.dl_dir = args.dl_dir
self.version = args.version
self.subdir = args.subdir
self.source = args.source
self.url = args.url
self._init_owner_repo()
self.xhash = args.hash
self._init_hasher()
self.commit_ts = None # lazy load commit timestamp
self.commit_ts_cache = GitHubCommitTsCache()
self.name = 'github-tarball'
@staticmethod
def match(args):
"""Match if it's a GitHub clone url."""
url = args.urls[0]
proto = args.proto
if proto == 'git' and isinstance(url, basestring) \
and (url.startswith('https://github.com/') or url.startswith('git://github.com/')):
return True
return False
def download(self):
"""Download and repack GitHub archive tarball."""
self._init_commit_ts()
@ -265,18 +257,23 @@ class DownloadMethodGitHubTarball(DownloadMethod):
self._fetch(tarball_path)
# unpack
d = os.path.join(dir_dl.path, self.subdir + '.untar')
with Path(d) as dir_untar:
with Path(d, preclean=True) as dir_untar:
tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
dir0 = os.path.join(dir_untar.path, tarball_prefix)
dir1 = os.path.join(dir_untar.path, self.subdir)
# submodules check
if self._has_submodule(dir0):
raise DownloadException('unable to fetch submodules\' source code')
raise self._error('Fetching submodules is not yet supported')
# rename subdir
os.rename(dir0, dir1)
# repack
into=os.path.join(TMPDIR_DL, self.source)
Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
try:
self._hash_check(into)
except Exception:
Path.rm_all(into)
raise
# move to target location
file1 = os.path.join(self.dl_dir, self.source)
if into != file1:
@ -291,10 +288,9 @@ class DownloadMethodGitHubTarball(DownloadMethod):
return e.errno != errno.ENOENT
def _init_owner_repo(self):
url = self.url
m = self.__repo_url_regex.search(url)
m = self.__repo_url_regex.search(self.url)
if m is None:
raise DownloadException('invalid github url: %s' % url)
raise self._error('Invalid github url: {}'.format(self.url))
owner = m.group('owner')
repo = m.group('repo')
if repo.endswith('.git'):
@ -302,23 +298,79 @@ class DownloadMethodGitHubTarball(DownloadMethod):
self.owner = owner
self.repo = repo
def _init_hasher(self):
xhash = self.xhash
if len(xhash) == 64:
self.hasher = hashlib.sha256()
elif len(xhash) == 32:
self.hasher = hashlib.md5()
else:
raise self._error('Requires sha256sum for verification')
self.xhash = xhash
def _hash_check(self, f):
with open(f, 'rb') as fin:
while True:
d = fin.read(4096)
if not d:
break
self.hasher.update(d)
xhash = self.hasher.hexdigest()
if xhash != self.xhash:
raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))
def _init_commit_ts(self):
if self.commit_ts is not None:
return
url = self._make_repo_url_path('git', 'commits', self.version)
# GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
# terse while API[2] provides more verbose info such as commit diff
# etc. That's the main reason why API[1] is preferred: the response
# size is predictable.
#
# However, API[1] only accepts complete commit sha1sum as the parameter
# while API[2] is more liberal accepting also partial commit id and
# tags, etc.
#
# [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
# [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
apis = [
{
'url': self._make_repo_url_path('git', 'commits', self.version),
'attr_path': ('committer', 'date'),
}, {
'url': self._make_repo_url_path('commits', self.version),
'attr_path': ('commit', 'committer', 'date'),
},
]
version_is_sha1sum = len(self.version) == 40
if not version_is_sha1sum:
apis.insert(0, apis.pop())
for api in apis:
url = api['url']
attr_path = api['attr_path']
try:
ct = self.commit_ts_cache.get(url)
if ct is not None:
self.commit_ts = ct
return
ct = self._init_commit_ts_remote_get(url, attr_path)
self.commit_ts = ct
self.commit_ts_cache.set(url, ct)
return
except Exception:
pass
raise self._error('Cannot fetch commit ts: {}'.format(url))
def _init_commit_ts_remote_get(self, url, attrpath):
resp = self._make_request(url)
data = resp.read()
data = json.loads(data)
date = data['committer']['date']
date = json.loads(data)
for attr in attrpath:
date = date[attr]
date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
date = date.timetuple()
ct = calendar.timegm(date)
self.commit_ts = ct
self.commit_ts_cache.set(url, ct)
return ct
def _fetch(self, path):
"""Fetch tarball of the specified version ref."""
@ -350,71 +402,24 @@ class DownloadMethodGitHubTarball(DownloadMethod):
fileobj = urllib2.urlopen(req, context=sslcontext)
return fileobj
class DownloadMethodCatchall(DownloadMethod):
"""Dummy method that knows names but not ways of download."""
def __init__(self, args):
super(DownloadMethodCatchall, self).__init__(args)
self.args = args
self.proto = args.proto
self.name = self._resolve_name()
def _resolve_name(self):
if self.proto:
return self.proto
methods_map = (
('default', ('@APACHE/', '@GITHUB/', '@GNOME/', '@GNU/',
'@KERNEL/', '@SF/', '@SAVANNAH/', 'ftp://', 'http://',
'https://', 'file://')),
('git', ('git://', )),
('svn', ('svn://', )),
('cvs', ('cvs://', )),
('bzr', ('sftp://', )),
('bzr', ('sftp://', )),
('unknown', ('', )),
)
for name, prefixes in methods_map:
if any(url.startswith(prefix) for prefix in prefixes for url in self.urls):
return name
@staticmethod
def match(args):
"""Return True."""
return True
def download(self):
"""Not implemented.
raise DownloadException
"""
raise DownloadException('download method for %s is not yet implemented' % self.name)
# order matters
DOWNLOAD_METHODS = [
DownloadMethodGitHubTarball,
DownloadMethodCatchall,
]
def _error(self, msg):
    """Build (without raising) a DownloadGitHubError describing ``msg``.

    The message text is prefixed with ``self.source`` and a colon.
    Callers are expected to ``raise`` the returned exception themselves.
    """
    text = '{}: {}'.format(self.source, msg)
    return DownloadGitHubError(text)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('action', choices=('dl_method', 'dl'), help='Action to take')
parser.add_argument('--urls', nargs='+', metavar='URL', help='Download URLs')
parser.add_argument('--proto', help='Download proto')
parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
parser.add_argument('--url', help='Download URL')
parser.add_argument('--subdir', help='Source code subdir name')
parser.add_argument('--version', help='Source code version')
parser.add_argument('--source', help='Source tarball filename')
parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
args = parser.parse_args()
if args.action == 'dl_method':
method = DownloadMethod.resolve(args)
sys.stdout.write(method.name + '\n')
elif args.action == 'dl':
method = DownloadMethod.resolve(args)
method = DownloadGitHubTarball(args)
try:
method.download()
except Exception:
sys.stderr.write('download {} from {} failed\n'.format(args.source, args.url))
raise
if __name__ == '__main__':