Skip to content

Commit

Permalink
👷Implement build-time pip bundling @ ensurepip
Browse files Browse the repository at this point in the history
Prior to this patch, Pip wheels were stored in the Git repository of
CPython. Git is optimized for text but these artifacts are binary. So
the unpleasant side effect of doing this is that the bare Git
repository size is being increased by the zip archive size every time
a wheel is added, removed or modified. It's time to put a stop to this.

The patch implements an `ensurepip.bundle` module that is meant to be
called through `runpy` to download the Pip wheel and place it into the
same location as before. It removes the wheel file from the Git
repository and prevents re-adding it by defining a new `.gitignore`
configuration file.

The idea is that the builders of CPython are supposed to invoke the
following command during the build time:

```console
$ python -m ensurepip.bundle
```

This command will verify the existing wheel's SHA-256 hash and, if the
hash does not match or the wheel doesn't exist, it will proceed to
download the artifact from PyPI. It will confirm its SHA-256 hash before
placing it into the `Lib/ensurepip/_bundled/` directory.

Every single line added or modified as a part of this change is also
covered with tests. Every new module has 100% coverage. The only
uncovered lines under `Lib/ensurepip/` are the ones that are
absolutely unrelated to this effort.

Resolves python#80789.

Ref: https://bugs.python.org/issue36608.
  • Loading branch information
webknjaz committed Aug 26, 2023
1 parent 713afb8 commit beec28a
Show file tree
Hide file tree
Showing 14 changed files with 650 additions and 240 deletions.
33 changes: 0 additions & 33 deletions .github/workflows/verify-ensurepip-wheels.yml

This file was deleted.

108 changes: 20 additions & 88 deletions Lib/ensurepip/__init__.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,16 @@
import collections
"""Bundled Pip installer."""

import os
import os.path
import pathlib
import shutil
import subprocess
import sys
import sysconfig
import tempfile
from importlib import resources


__all__ = ["version", "bootstrap"]
_PACKAGE_NAMES = ('pip',)
_PIP_VERSION = "23.2.1"
_PROJECTS = [
("pip", _PIP_VERSION, "py3"),
]
from ._wheelhouses import discover_ondisk_packages

# Packages bundled in ensurepip._bundled have wheel_name set.
# Packages from WHEEL_PKG_DIR have wheel_path set.
_Package = collections.namedtuple('Package',
('version', 'wheel_name', 'wheel_path'))

# Directory of system wheel packages. Some Linux distribution packaging
# policies recommend against bundling dependencies. For example, Fedora
# installs wheel packages in the /usr/share/python-wheels/ directory and doesn't
# install the ensurepip._bundled package.
_WHEEL_PKG_DIR = sysconfig.get_config_var('WHEEL_PKG_DIR')


def _find_packages(path):
    """Collect the wheels of the known packages found directly in *path*.

    Returns a dict mapping each name from ``_PACKAGE_NAMES`` that has a
    matching ``<name>-*.whl`` file in *path* to a ``_Package`` whose
    ``wheel_name`` is ``None`` and whose ``wheel_path`` points at that file.
    A directory that cannot be listed (``OSError``) yields an empty dict.
    """
    try:
        candidates = os.listdir(path)
    except OSError:
        # Missing directory or insufficient permissions: nothing to offer.
        candidates = ()

    found = {}
    # Sorting keeps the outcome deterministic when several wheels of the
    # same package are present; no real version comparison is attempted
    # because that situation is not expected to occur.
    for wheel_file in sorted(candidates):
        # wheel_file looks like 'pip-21.2.4-py3-none-any.whl'
        if not wheel_file.endswith(".whl"):
            continue

        project = next(
            (
                known for known in _PACKAGE_NAMES
                if wheel_file.startswith(known + '-')
            ),
            None,
        )
        if project is None:
            continue

        # 'pip-21.2.4-py3-none-any.whl' -> '21.2.4'
        remainder = wheel_file.removeprefix(project + '-')
        wheel_version, _, _ = remainder.partition('-')
        found[project] = _Package(
            wheel_version, None, os.path.join(path, wheel_file),
        )
    return found


def _get_packages():
    """Return the mapping of package name to ``_Package``, computing it once.

    The result is cached in the module-level ``_PACKAGES``. By default it
    describes the wheels listed in ``_PROJECTS`` (``wheel_name`` set,
    ``wheel_path`` ``None``). When ``_WHEEL_PKG_DIR`` is set and that
    directory provides every package named in ``_PACKAGE_NAMES``, the
    packages found there are used instead.
    """
    global _PACKAGES
    if _PACKAGES is None:
        resolved = {
            name: _Package(
                version, f"{name}-{version}-{py_tag}-none-any.whl", None,
            )
            for name, version, py_tag in _PROJECTS
        }
        if _WHEEL_PKG_DIR:
            system_wheels = _find_packages(_WHEEL_PKG_DIR)
            # Only switch to the wheel package directory when it supplies
            # every expected package; a partial set is ignored entirely.
            missing = [
                name for name in _PACKAGE_NAMES if name not in system_wheels
            ]
            if not missing:
                resolved = system_wheels
        _PACKAGES = resolved
    return _PACKAGES
_PACKAGES = None
__all__ = ("version", "bootstrap")


def _run_pip(args, additional_paths=None):
Expand Down Expand Up @@ -105,7 +43,7 @@ def version():
"""
Returns a string specifying the bundled version of pip.
"""
return _get_packages()['pip'].version
return discover_ondisk_packages()['pip'].project_version


def _disable_pip_configuration_settings():
Expand Down Expand Up @@ -164,27 +102,18 @@ def _bootstrap(*, root=None, upgrade=False, user=False,
# omit pip
os.environ["ENSUREPIP_OPTIONS"] = "install"

ondisk_dist_pkgs_map = discover_ondisk_packages()
with tempfile.TemporaryDirectory() as tmpdir:
# Put our bundled wheels into a temporary directory and construct the
# additional paths that need to be added to sys.path
tmpdir_path = pathlib.Path(tmpdir)
additional_paths = []
for name, package in _get_packages().items():
if package.wheel_name:
# Use bundled wheel package
wheel_name = package.wheel_name
wheel_path = resources.files("ensurepip") / "_bundled" / wheel_name
whl = wheel_path.read_bytes()
else:
# Use the wheel package directory
with open(package.wheel_path, "rb") as fp:
whl = fp.read()
wheel_name = os.path.basename(package.wheel_path)

filename = os.path.join(tmpdir, wheel_name)
with open(filename, "wb") as fp:
fp.write(whl)

additional_paths.append(filename)
for package in ondisk_dist_pkgs_map.values():
with package.as_pathlib_ctx() as bundled_wheel_path:
tmp_wheel_path = tmpdir_path / bundled_wheel_path.name
shutil.copy2(bundled_wheel_path, tmp_wheel_path)

additional_paths.append(str(tmp_wheel_path))

# Construct the arguments to be passed to the pip command
args = ["install", "--no-cache-dir", "--no-index", "--find-links", tmpdir]
Expand All @@ -197,7 +126,9 @@ def _bootstrap(*, root=None, upgrade=False, user=False,
if verbosity:
args += ["-" + "v" * verbosity]

return _run_pip([*args, *_PACKAGE_NAMES], additional_paths)
bundled_project_names = list(ondisk_dist_pkgs_map.keys())
return _run_pip(args + bundled_project_names, additional_paths)


def _uninstall_helper(*, verbosity=0):
"""Helper to support a clean default uninstall process on Windows
Expand Down Expand Up @@ -227,7 +158,8 @@ def _uninstall_helper(*, verbosity=0):
if verbosity:
args += ["-" + "v" * verbosity]

return _run_pip([*args, *reversed(_PACKAGE_NAMES)])
bundled_project_names = list(discover_ondisk_packages().keys())
return _run_pip(args + bundled_project_names)


def _main(argv=None):
Expand Down
3 changes: 3 additions & 0 deletions Lib/ensurepip/_bundled/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*
!.gitignore
!README.md
23 changes: 23 additions & 0 deletions Lib/ensurepip/_bundled/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Upstream packaging

To populate this directory, the initial build packagers are supposed
to invoke the following command:

```console
$ python -m ensurepip.bundle
```

It will download a pre-defined version of the Pip wheel. Its SHA-256
hash is guaranteed to match the one on PyPI.

# Downstream packaging

Packagers of the downstream distributions are welcome to put an
alternative wheel version in the directory defined by the
`WHEEL_PKG_DIR` configuration setting. If this is done,

```console
$ python -m ensurepip
```

will prefer the replacement distribution package over the bundled one.
Binary file removed Lib/ensurepip/_bundled/pip-23.2.1-py3-none-any.whl
Binary file not shown.
40 changes: 40 additions & 0 deletions Lib/ensurepip/_bundler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Build time dist downloading and bundling logic."""

from __future__ import annotations

import sys
from contextlib import suppress
from importlib.resources import as_file as _traversable_to_pathlib_ctx

from ._structs import BUNDLED_WHEELS_PATH, REMOTE_DIST_PKGS


def ensure_wheels_are_downloaded(*, verbosity: bool = False) -> None:
    """Populate the bundle directory with any wheel that is not usable yet.

    For every entry of ``REMOTE_DIST_PKGS`` the file of the same name under
    ``BUNDLED_WHEELS_PATH`` is read and handed to ``pkg.matches()``; when
    that check passes the package is skipped. Otherwise the contents come
    from ``pkg.download_verified_wheel_contents()`` and are written into
    the bundle directory.

    :param verbosity: when truthy, progress messages are printed to stderr.
    """
    for pkg in REMOTE_DIST_PKGS:
        cached_wheel = BUNDLED_WHEELS_PATH / pkg.wheel_file_name
        try:
            if pkg.matches(cached_wheel.read_bytes()):
                if verbosity:
                    print(
                        f'A valid `{pkg.wheel_file_name}` is already '
                        'present in cache. Skipping download.',
                        file=sys.stderr,
                    )
                continue
        except FileNotFoundError:
            # Nothing on disk yet, so fall through to the download below.
            pass

        if verbosity:
            print(
                f'Downloading `{pkg.wheel_file_name}`...',
                file=sys.stderr,
            )
        wheel_bytes = pkg.download_verified_wheel_contents()

        if verbosity:
            print(
                f'Saving `{pkg.wheel_file_name}` to disk...',
                file=sys.stderr,
            )
        # The bundle location is converted to a concrete filesystem path
        # for the duration of the write.
        with _traversable_to_pathlib_ctx(BUNDLED_WHEELS_PATH) as target_dir:
            (target_dir / pkg.wheel_file_name).write_bytes(wheel_bytes)
Loading

0 comments on commit beec28a

Please sign in to comment.