# Viewing File: /home/ubuntu/.local/lib/python3.10/site-packages/fastdownload/core.py

# AUTOGENERATED! DO NOT EDIT! File to edit: 00_core.ipynb (unless otherwise specified).

__all__ = ['download_url', 'path_stats', 'checks_module', 'read_checks', 'check', 'update_checks', 'download_and_check',
           'FastDownload']

# Cell
from fastprogress.fastprogress import progress_bar
from fastcore.all import *
import hashlib,shutil
from pprint import pformat

# Cell
def download_url(url, dest=None, timeout=None, show_progress=True):
    "Save `url` to `dest` via `urlsave`, reporting progress unless `show_progress` is false"
    bar = progress_bar([])

    def _report(count=1, bsize=1, tsize=None):
        # Hook invoked by `urlsave`; `count*bsize` is forwarded to the bar as the
        # current position and `tsize` as its total.
        bar.total = tsize
        bar.update(count*bsize)

    hook = _report if show_progress else None
    return urlsave(url, dest, reporthook=hook, timeout=timeout)

# Cell
def path_stats(fpath):
    "Return `(size, md5_hexdigest)` for `fpath`; only the first 1MB of the file is hashed"
    nbytes = os.path.getsize(fpath)
    # Hashing just the leading 1MB keeps this cheap for large archives
    with open(fpath, "rb") as fh:
        head = fh.read(2**20)
    digest = hashlib.md5(head).hexdigest()
    return nbytes, digest

# Cell
def checks_module(module):
    "Path of `download_checks.py` next to `module`'s file, or `{}` when `module` is falsy"
    if module:
        pkg_dir = Path(module.__file__).parent
        return pkg_dir/'download_checks.py'
    # The empty dict is the "no checks file" sentinel that `read_checks` looks for
    return {}

# Cell
def read_checks(fmod):
    "Evaluate and return the contents of the checks file `fmod`; `{}` if there is nothing to read"
    # `{}` is the sentinel for "no module"; a missing file is treated the same way
    if fmod == {}: return {}
    if not fmod.exists(): return {}
    txt = fmod.read_text()
    if not txt: return {}
    # NOTE(review): `eval` runs whatever expression the file contains, so the checks
    # file must be trusted. If it only ever holds literals, `ast.literal_eval` would
    # be a safer choice -- confirm before changing.
    return eval(txt)

# Cell
def check(fmod, url, fpath):
    "`True` if no stats are stored for `url`, or if the stats of `fpath` equal the stored ones"
    expected = read_checks(fmod).get(url)
    # Nothing recorded for this url means there is nothing to verify against
    if not expected: return True
    return path_stats(fpath) == expected

# Cell
def update_checks(fpath, url, fmod):
    """Store the size and hash of `fpath` for `url` in the checks file `fmod`.

    The existing checks are read, the entry for `url` is replaced with the
    current `path_stats(fpath)`, and the whole mapping is written back with
    `pformat`.

    Raises `ValueError` if `fmod` is `{}`, i.e. there is no checks file to
    write to.
    """
    # `checks_module` returns `{}` when no module was given. Without this guard the
    # call fails only at `write_text`, with an opaque "'dict' object has no
    # attribute 'write_text'" error.
    if fmod == {}:
        raise ValueError("No module was provided, so there is no `download_checks.py` to update")
    checks = read_checks(fmod)
    checks[url] = path_stats(fpath)
    fmod.write_text(pformat(checks))

# Cell
def download_and_check(url, fpath, fmod, force):
    "Download `url` to `fpath`, unless it already exists, passes `check`, and `force` is false"
    if not force and fpath.exists():
        # An existing file that still matches the stored stats is reused as-is
        if check(fmod, url, fpath): return fpath
        print("Downloading a new version of this dataset...")
    saved = download_url(url, fpath)
    # Verify the fresh download against the stored stats before handing it back
    if check(fmod, url, fpath): return saved
    raise Exception("Downloaded file is corrupt or not latest version")

# Cell
class FastDownload:
    "Download, verify, and extract archives into the `archive` and `data` paths of a config"
    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        "Use `cfg` if given, else a `Config` at `base`/config.ini; `data`/`archive` override its entries"
        root = Path(base).expanduser().absolute()
        defaults = {'data':(data or 'data'), 'archive':(archive or 'archive')}
        if cfg is None: cfg = Config(root, 'config.ini', create=defaults)
        self.cfg = cfg
        self.module = checks_module(module)
        # Explicitly passed locations win over whatever the config already holds
        for key,val in (('data', data), ('archive', archive)):
            if val is not None: self.cfg[key] = val

    def arch_path(self):
        "Path where downloaded archives are kept"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data', arch=None):
        "Path for extracted data; with `arch`, the sub-path named after the archive"
        root = self.cfg.path(extract_key)
        if arch is None: return root
        # `stem` drops the last suffix only, so a remaining `.tar` is removed too
        return root/remove_suffix(arch.stem, '.tar')

    def check(self, url, fpath):
        "`True` if no stats are stored for `url`, or if the stats of `fpath` equal the stored ones"
        expected = read_checks(self.module).get(url)
        if not expected: return True
        return path_stats(fpath) == expected

    def download(self, url, force=False):
        "Download `url` to the archive path, unless it already exists, passes the check, and `force` is false"
        self.arch_path().mkdir(parents=True, exist_ok=True)
        target = urldest(url, self.arch_path())
        return download_and_check(url, target, self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete the downloaded archive and/or the extracted data for `url`"
        archive = urldest(url, self.arch_path())
        if rm_arch: archive.delete()
        if rm_data:
            extracted = self.data_path(extract_key, archive)
            extracted.delete()

    def update(self, url):
        "Record the size and hash of the archive for `url` in `download_checks.py`"
        archive = urldest(url, self.arch_path())
        update_checks(archive, url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract the already-downloaded archive for `url`, overwriting existing data if `force`"
        archive = urldest(url, self.arch_path())
        if not archive.exists(): raise Exception(f'{archive} does not exist')
        target = self.data_path(extract_key)
        target.mkdir(parents=True, exist_ok=True)
        return untar_dir(archive, target, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`; returns existing extracted data unless `force`"
        if not force:
            archive = urldest(url, self.arch_path())
            extracted = self.data_path(extract_key, archive)
            # Already extracted: skip both the download and the extraction
            if extracted.exists(): return extracted
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)
# Back to Directory File Manager