# AUTOGENERATED! DO NOT EDIT! File to edit: 00_core.ipynb (unless otherwise specified).
# Names exported when this module is star-imported (`from <module> import *`).
__all__ = ['download_url', 'path_stats', 'checks_module', 'read_checks', 'check', 'update_checks', 'download_and_check',
'FastDownload']
# Cell
from fastprogress.fastprogress import progress_bar
from fastcore.all import *
import hashlib,shutil
from pprint import pformat
# Cell
def download_url(url, dest=None, timeout=None, show_progress=True):
    """Download `url` to `dest`, optionally reporting progress.

    The actual transfer is delegated to `urlsave`; its return value is passed
    straight back to the caller. When `show_progress` is false no report hook
    is installed (the progress bar object is still created, but never updated).
    """
    bar = progress_bar([])

    def _report(count=1, bsize=1, tsize=None):
        # Hook signature mirrors (block count, block size, total size);
        # presumably the same convention as `urllib`'s reporthook -- confirm against `urlsave`.
        bar.total = tsize
        bar.update(count*bsize)

    hook = _report if show_progress else None
    return urlsave(url, dest, reporthook=hook, timeout=timeout)
# Cell
def path_stats(fpath):
    """Return `(size, md5_hex)` for the file at `fpath`.

    `size` is the file size in bytes. The MD5 digest covers only the first
    2**20 bytes (1 MiB) of the file, which keeps the check cheap for large
    downloads.
    """
    nbytes = os.path.getsize(fpath)
    digest = hashlib.md5()
    with open(fpath, "rb") as fh:
        # Only the leading 1 MiB is hashed, for performance
        digest.update(fh.read(2**20))
    return nbytes, digest.hexdigest()
# Cell
def checks_module(module):
    """Location of `download_checks.py` next to `module`'s source file.

    Returns an empty dict when `module` is falsy; `read_checks` treats that
    value as "no checks file".
    """
    if module:
        module_dir = Path(module.__file__).parent
        return module_dir/'download_checks.py'
    return {}
# Cell
def read_checks(fmod):
    """Evaluated contents of `download_checks.py`.

    `fmod` is the path to the checks file, or a falsy placeholder (`{}` as
    produced by `checks_module`, or `None`) when no checks file is configured.
    Returns `{}` when there is no file, the file does not exist, or the file
    is empty or contains only whitespace; otherwise returns the result of
    evaluating the file's text.
    """
    # Any falsy `fmod` means "no checks file"; path objects are always truthy.
    if not fmod or not fmod.exists(): return {}
    txt = fmod.read_text()
    # A blank or whitespace-only file would make `eval` raise `SyntaxError`.
    if not txt.strip(): return {}
    # NOTE(review): `eval` executes whatever expression the file contains. The
    # file is expected to hold the `pformat` output written by `update_checks`;
    # do not point `fmod` at a file from an untrusted source.
    return eval(txt)
# Cell
def check(fmod, url, fpath):
    """Whether `fpath` is consistent with the stored check data for `url`.

    Returns `True` when no (or empty) data is stored for `url` in the checks
    file `fmod`; otherwise returns whether `path_stats(fpath)` equals the
    stored value.
    """
    expected = read_checks(fmod).get(url)
    # Nothing recorded for this URL: nothing to verify against
    if not expected: return True
    return path_stats(fpath) == expected
# Cell
def update_checks(fpath, url, fmod):
    """Record the current `path_stats` of `fpath` under `url` in the checks file `fmod`.

    Existing entries are read with `read_checks`, the entry for `url` is added
    or replaced, and the whole mapping is written back as `pformat` text.
    """
    # Existing entries are read before the file is measured, then merged
    updated = {**read_checks(fmod), url: path_stats(fpath)}
    fmod.write_text(pformat(updated))
# Cell
def download_and_check(url, fpath, fmod, force):
    """Download `url` to `fpath` and verify it against the checks file `fmod`.

    The download is skipped, and `fpath` returned, when `force` is false, the
    file already exists, and `check` passes. If the existing file fails the
    check a notice is printed and the file is downloaded again. Raises
    `Exception` when the freshly downloaded file fails `check`; otherwise
    returns the result of `download_url`.
    """
    if not force and fpath.exists():
        if check(fmod, url, fpath): return fpath
        # Existing copy no longer matches the recorded size/hash
        print("Downloading a new version of this dataset...")
    res = download_url(url, fpath)
    if check(fmod, url, fpath): return res
    raise Exception("Downloaded file is corrupt or not latest version")
# Cell
class FastDownload:
    "Download URLs into an archive directory, verify them, and extract them into a data directory"

    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        """Set up the configuration and checks-file location.

        When `cfg` is `None` a `Config` is built from `base` and `config.ini`,
        with `data`/`archive` (or the literal defaults 'data'/'archive') as the
        creation defaults. Explicit `data`/`archive` arguments are then written
        into the config. `module` is passed to `checks_module` to locate
        `download_checks.py`.
        """
        root = Path(base).expanduser().absolute()
        if cfg is None:
            defaults = {'data':(data or 'data'), 'archive':(archive or 'archive')}
            cfg = Config(root, 'config.ini', create=defaults)
        self.cfg = cfg
        self.module = checks_module(module)
        # Explicit arguments override whatever the config currently holds
        for key, val in (('data', data), ('archive', archive)):
            if val is not None: self.cfg[key] = val

    def arch_path(self):
        "Path to archives"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data', arch=None):
        "Path to extracted data; with `arch`, the sub-path named after the archive stem (minus any '.tar')"
        root = self.cfg.path(extract_key)
        if arch is None: return root
        return root/remove_suffix(arch.stem, '.tar')

    def check(self, url, fpath):
        "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
        expected = read_checks(self.module).get(url)
        if not expected: return True
        return path_stats(fpath) == expected

    def download(self, url, force=False):
        "Download `url` into the archive path via `download_and_check`, creating the directory if needed"
        arch_dir = self.arch_path()
        arch_dir.mkdir(exist_ok=True, parents=True)
        target = urldest(url, arch_dir)
        return download_and_check(url, target, self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete downloaded archive and extracted data for `url`"
        archive = urldest(url, self.arch_path())
        if rm_arch: archive.delete()
        if rm_data:
            extracted = self.data_path(extract_key, archive)
            extracted.delete()

    def update(self, url):
        "Store the hash and size in `download_checks.py`"
        archive = urldest(url, self.arch_path())
        update_checks(archive, url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract archive already downloaded from `url`, overwriting existing if `force`; raises if the archive is missing"
        arch = urldest(url, self.arch_path())
        if not arch.exists(): raise Exception(f'{arch} does not exist')
        target_dir = self.data_path(extract_key)
        target_dir.mkdir(exist_ok=True, parents=True)
        return untar_dir(arch, target_dir, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`, overwriting existing if `force`; returns existing data path if present and not `force`"
        if not force:
            archive = urldest(url, self.arch_path())
            existing = self.data_path(extract_key, archive)
            # Already extracted earlier: reuse it without touching the network
            if existing.exists(): return existing
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)