Source code for piwheels.master.the_scribe

# The piwheels project
#   Copyright (c) 2017 Ben Nuttall <https://github.com/bennuttall>
#   Copyright (c) 2017 Dave Jones <dave@waveform.org.uk>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""
Defines the :class:`TheScribe` task; see class for more details.

.. autoclass:: TheScribe
    :members:
"""

import os
import gzip
import shutil
import tempfile
from pathlib import Path
from datetime import datetime, timezone
from itertools import zip_longest
from operator import itemgetter
from collections import namedtuple

import pkg_resources
from chameleon import PageTemplateLoader
import simplejson as json

from .. import const, protocols, tasks, transport
from ..format import format_size
from ..states import mkdir_override_symlink, MasterStats
from .the_oracle import DbClient

# Module-level shorthand for the UTC timezone (passed as ``tz=UTC`` when
# timestamps are generated below)
UTC = timezone.utc


# Immutable record type; field order is significant for positional use
ProjectRelease = namedtuple(
    'ProjectRelease',
    'version yanked released skip abis files builds_succeeded builds_failed')


class PackageDeleted(ValueError):
    """
    Error raised when a package is deleted and doesn't need updating.
    """


class TheScribe(tasks.PauseableTask):
    """
    This task is responsible for writing web-page ``index.html`` files. It
    reads the names of packages off the internal "indexes" queue and rebuilds
    the ``index.html`` for that package and, optionally, the overall
    ``index.html`` if the package is one that wasn't previously present.

    .. note::

        It is important to note that package names are never pushed into the
        internal "indexes" queue until all file-transfers associated with the
        build are complete. Furthermore, while the entire index for a package
        is re-built, hashes are *never* re-calculated from the disk files (they
        are always read from the database).
    """
    name = 'master.the_scribe'

    def __init__(self, config):
        """
        Bind the request queue, connect the database client and prepare the
        template loader.

        :param config:
            The task configuration; ``config.output_path`` is the root
            directory under which all pages are written.
        """
        super().__init__(config)
        self.output_path = Path(config.output_path)
        scribe_queue = self.socket(
            transport.REP, protocol=protocols.the_scribe)
        scribe_queue.hwm = 100
        scribe_queue.bind(const.SCRIBE_QUEUE)
        self.register(scribe_queue, self.handle_index)
        self.db = DbClient(config, self.logger)
        # Populated by once(); None until then
        self.package_cache = None
        self.statistics = {}
        self.templates = PageTemplateLoader(
            search_path=[
                pkg_resources.resource_filename(__name__, 'templates')
            ],
            default_extension='.pt')

    def close(self):
        """
        Close the database client, then the task itself, and finally release
        any resources extracted by :mod:`pkg_resources`.
        """
        self.db.close()
        super().close()
        pkg_resources.cleanup_resources()

    def once(self):
        """
        Set up the output path, load the package cache from the database and
        write the root simple index if it does not exist yet.
        """
        self.setup_output_path()
        self.logger.info('building package cache')
        self.package_cache = self.db.get_all_packages()
        # Perform a one-time write of the root index if it doesn't exist; this
        # is primarily for limited setups which don't expect to see "new"
        # packages show up (the usual trigger for re-writing the root index)
        if not (self.output_path / 'simple' / 'index.html').exists():
            self.write_simple_index()

    def setup_output_path(self):
        """
        Called on task startup to copy all static resources into the output
        path (and to make sure the output path exists as a directory). The
        fixed set of "startup" templates (FAQ, package search, stats, JSON
        API and 404 pages) is also rendered into the output path.
        """
        self.logger.info('setting up output path')
        required_paths = (
            self.output_path,
            self.output_path / 'simple',
            self.output_path / 'project',
        )
        for path in required_paths:
            try:
                path.mkdir()
            except FileExistsError:
                pass
        for filename in pkg_resources.resource_listdir(__name__, 'static'):
            # Close the resource stream once copied instead of leaving the
            # handle to be reclaimed by the garbage collector
            with pkg_resources.resource_stream(
                    __name__, 'static/' + filename) as source:
                with AtomicReplaceFile(self.output_path / filename) as f:
                    shutil.copyfileobj(source, f)
        # Maps template filename to the (title, description) it is rendered
        # with
        startup_templates = {
            'faq.pt': (
                'FAQ',
                'frequently asked questions about the piwheels project'),
            'packages.pt': (
                'Package search',
                'search for packages in the piwheels repository'),
            'stats.pt': ('Stats', 'piwheels usage statistics'),
            'json.pt': (
                'JSON API', 'information about the piwheels JSON API'),
            '404.pt': ('404 - file not found', 'file not found'),
        }
        for filename in pkg_resources.resource_listdir(__name__, 'templates'):
            if filename in startup_templates:
                title, description = startup_templates[filename]
                source = self.templates[filename](
                    layout=self.templates['layout']['layout'],
                    page=filename.replace('.pt', ''),
                    title=title,
                    description=description,
                )
                with AtomicReplaceFile(
                        (self.output_path / filename).with_suffix('.html'),
                        encoding='utf-8') as f:
                    f.write(source)

    def handle_index(self, queue):
        """
        Handle incoming requests to (re)build index files. These will be in
        the form of:

        * "HOME", a request to write the homepage with some associated
          statistics (the sitemap is re-written at the same time)

        * "BOTH", a request to write the index and project page for the
          specified package

        * "PROJECT", a request to write just the project page for the
          specified package

        * "SEARCH", a request to write the JSON search index from the
          provided data

        * "LOG", a request to write a build log

        * "DELVER", a request to delete the files of the specified version of
          a package and re-write the package's pages without that version

        * "DELPKG", a request to remove the specified package from the cache
          and root index, and delete its files and pages

        Every successfully received request is answered with "DONE". If the
        message cannot be received, the error is logged and no reply is sent.

        .. note::

            In all handlers below, care is taken to ensure clients never see a
            partially written file and that temporary files are cleaned up in
            the event of any exceptions.
        """
        try:
            msg, data = queue.recv_msg()
        except IOError as e:
            self.logger.error(str(e))
        else:
            if msg in ('BOTH', 'PROJECT'):
                package = data
                if msg == 'BOTH':
                    self.write_pages(package, both=True)
                else:
                    self.write_pages(package)
            elif msg == 'HOME':
                self.write_homepage(MasterStats.from_message(data))
                self.write_sitemap()
            elif msg == 'SEARCH':
                search_index = data
                self.write_search_index(search_index)
            elif msg == 'LOG':
                build_id, log = data
                self.write_log(build_id, log)
            elif msg == 'DELVER':
                package, version = data
                self.delete_version(package, version)
                self.write_pages(package, both=True, exclude={version})
            elif msg == 'DELPKG':
                package = data
                self.package_cache.discard(package)
                self.write_simple_index()
                self.delete_package(package)
            queue.send_msg('DONE')

    def write_homepage(self, statistics):
        """
        Re-writes the site homepage by rendering the ``index`` template with
        the provided statistics and the current UTC timestamp.

        :param statistics:
            The statistics made available to the template as ``stats``
            (:meth:`handle_index` passes a :class:`MasterStats` instance).
        """
        self.logger.info('writing homepage')
        dt = datetime.now(tz=UTC)
        with AtomicReplaceFile(self.output_path / 'index.html',
                               encoding='utf-8') as index:
            index.file.write(self.templates['index'](
                layout=self.templates['layout']['layout'],
                timestamp=dt.strftime('%Y-%m-%d %H:%M %Z'),
                page='home',
                title='Home',
                description='Python package repository providing wheels for Raspberry Pi',
                stats=statistics,
            ))

    def write_search_index(self, search_index):
        """
        Re-writes the JSON search index (``packages.json``) using the
        provided statistics.

        :param dict search_index:
            A dict mapping package names to a ``(count_recent, count_all)``
            pair of download counts.
        """
        self.logger.info('writing search index')
        with AtomicReplaceFile(self.output_path / 'packages.json',
                               encoding='utf-8') as index:
            # Re-organize into a list of (package, recent, all) tuples as this
            # is what the JS actually wants
            search_index = [
                (package, count_recent, count_all)
                for package, (count_recent, count_all) in search_index.items()
            ]
            json.dump(search_index, index.file,
                      check_circular=False, separators=(',', ':'))

    def write_sitemap(self):
        """
        (Re)writes the XML sitemap pages and index: ``sitemap0.xml`` lists
        the static pages, ``sitemap1.xml`` onwards list the packages in the
        cache (at most 50000 per page), and ``sitemap.xml`` is the index.
        """
        self.logger.info('writing sitemap')
        pages = ['index.html', 'packages.html', 'faq.html', 'json.html',
                 'stats.html']
        with AtomicReplaceFile(self.output_path / 'sitemap0.xml',
                               encoding='utf-8') as page:
            page.file.write(self.templates['sitemap_static'](pages=pages))
        links_per_page = 50000  # google sitemap limit
        # n stays 0 when the package cache is empty (the loop never runs)
        n = 0
        # grouper pads the final group with None up to links_per_page items
        pages = grouper(self.package_cache, links_per_page)
        for n, packages in enumerate(pages, start=1):
            with AtomicReplaceFile(
                    self.output_path / 'sitemap{}.xml'.format(n),
                    encoding='utf-8') as page:
                page.file.write(self.templates['sitemap_page'](
                    packages=packages)
                )
        # NOTE(review): naive local time here whereas the other pages use
        # datetime.now(tz=UTC) -- confirm this is intended
        dt = datetime.now()
        with AtomicReplaceFile(self.output_path / 'sitemap.xml',
                               encoding='utf-8') as sitemap:
            # NOTE(review): range(n) yields 0..n-1 but the files written above
            # are numbered 0..n -- confirm the template accounts for this
            sitemap.file.write(self.templates['sitemap_index'](
                pages=range(n), timestamp=dt.strftime('%Y-%m-%d'))
            )

    def write_simple_index(self):
        """
        (Re)writes the index of all packages. This is implicitly called when a
        request to write a package index is received for a package not present
        in the task's cache.
        """
        self.logger.info('writing package index')
        with AtomicReplaceFile(self.output_path / 'simple' / 'index.html',
                               encoding='utf-8') as index:
            index.file.write(self.templates['simple_index'](
                packages=self.package_cache))

    def write_pages(self, package, *, both=False, exclude=None):
        """
        (Re)writes the project page and project JSON file (and simple index if
        *both* is True) for the specified *package*.

        :param str package:
            The name of the package to write the pages for

        :param bool both:
            Write both the project page and the simple page if True, otherwise
            only write the project page. Note project page also includes
            project JSON.

        :type exclude: set or None
        :param exclude:
            The set of (deleted) versions to exclude from pages. Defaults to
            ``None``.
        """
        if exclude is None:
            exclude = set()
        data = self.db.get_project_data(package)
        # Rewrite versions as version objects, exclude deleted versions, and
        # sort the releases dict by the parsed version (newest first)
        data['releases'] = {
            parse_version(version): vers_data
            for version, vers_data in data['releases'].items()
            if version not in exclude
        }
        data['releases'] = {
            version: vers_data
            for version, vers_data in sorted(
                data['releases'].items(), key=itemgetter(0), reverse=True)
        }
        if both:
            self.write_package_index(package, data)
        self.write_project_page(package, data)
        self.write_project_json(package, data)

    def write_package_index(self, package, data):
        """
        (Re)writes the index of the specified *package*. The file meta-data
        (including the hash) is retrieved from the database, *never* from the
        file-system.

        The *data* parameter is expected to be the dictionary of package data
        returned by :meth:`.db.Database.get_project_data`. This is expected to
        have at least the following content in the example case of a package
        named "foo" with version "1.0" containing a validly built wheel::

            {
                'releases': {
                    '1.0': {
                        'files': {
                            'foo-1.0-py3-none-any.whl': {
                                'hash': 'abcdef1234567890...',
                                'requires_python': '>= 3.6',
                            },
                        },
                        'yanked': False,
                    },
                },
            }

        If *package* is not yet in the package cache it is added and the root
        simple index is re-written as well.

        :param str package:
            The name of the package to write the index page for.

        :param dict data:
            The dictionary of data returned by
            :meth:`.db.Database.get_project_data` which is expected to have at
            least the structure documented above.
        """
        self.logger.info('writing index for %s', package)
        files = [
            {
                'filename': filename,
                'filehash': file_data['hash'],
                'requires_python': file_data['requires_python'],
                'yanked': vers_data['yanked'],
            }
            for vers_data in data['releases'].values()
            for filename, file_data in vers_data['files'].items()
        ]
        pkg_dir = self.output_path / 'simple' / package
        mkdir_override_symlink(pkg_dir)
        with AtomicReplaceFile(pkg_dir / 'index.html',
                               encoding='utf-8') as index:
            index.file.write(
                self.templates['simple_package'](
                    package=package,
                    files=files))
        if package not in self.package_cache:
            self.package_cache.add(package)
            self.write_simple_index()

    def write_project_page(self, package, data):
        """
        (Re)writes the project page of the specified package, and creates a
        symlink to the project directory for each alias of the package.

        The *data* parameter is expected to be the dictionary of package data
        returned by :meth:`.db.Database.get_project_data`. This is expected to
        have at least the following content in the example case of a package
        named "foo" with version "1.0" containing a validly built wheel::

            {
                'name': 'foo',
                'description': 'A foomatic package',
                'releases': {
                    '1.0': {
                        'abis': {
                            'cp35m': {
                                'build_id': 1,
                                'status': 'success',
                                'skip': '',
                            }
                        },
                        'files': {
                            'foo-1.0-py3-none-any.whl': {
                                'hash': 'abcdef1234567890...',
                                'size': 123456,
                                'apt_dependencies': {'libc6'},
                            },
                        },
                        'released': datetime(2000, 1, 1, 12, 34, 56),
                        'yanked': False,
                        'skip': '',
                    },
                },
            }

        .. note::

            The keys of ``releases`` must be parsed version objects (as
            produced by :meth:`write_pages`) rather than plain strings, as
            their ``is_prerelease`` attribute is read here.

        :param str package:
            The name of the package to write the project page for.

        :param dict data:
            The dictionary of data returned by
            :meth:`.db.Database.get_project_data` which is expected to have at
            least the structure documented above.
        """
        self.logger.info('writing project page for %s', package)
        # This horribly confusing loop simply serves to efficiently extract
        # the apt_dependencies from the latest successful build, which is
        # reported (by default) as the dependencies at the top of the project
        # page. Ideally this would be done in the template, but the logic is
        # just too horrid to do nicely there. We *could* resort to javascript
        # picking the dependencies from the first row of the table, but that
        # then denies non-JS browsers (or search engines) any dependency info
        dependencies = set()
        for version, release in data['releases'].items():
            if not (version.is_prerelease or release['yanked']):
                for filedata in release['files'].values():
                    dependencies = filedata['apt_dependencies']
                    break
                else:
                    # No files in this release; try the next one
                    continue
                break
        project_dir = self.output_path / 'project' / package
        mkdir_override_symlink(project_dir)
        dt = datetime.now(tz=UTC)
        with AtomicReplaceFile(project_dir / 'index.html',
                               encoding='utf-8') as index:
            index.file.write(
                self.templates['project'](
                    layout=self.templates['layout']['layout'],
                    title=data['name'],
                    description=data['description'],
                    timestamp=dt,
                    page='project',
                    package=package,
                    releases=data['releases'],
                    dependencies=dependencies,
                    format_size=format_size))
        project_aliases = self.db.get_package_aliases(package)
        if project_aliases:
            self.logger.info('creating %s symlinks for project %s',
                             len(project_aliases), package)
            for project_alias in project_aliases:
                project_symlink = self.output_path / 'project' / project_alias
                try:
                    # Relative link to the sibling project directory
                    project_symlink.symlink_to(project_dir.name)
                except FileExistsError:
                    pass

    def write_project_json(self, package, data):
        """
        (Re)writes the project JSON data of the specified package.

        The *data* parameter is expected to be the dictionary of package data
        returned by :meth:`.db.Database.get_project_data`. This is expected to
        have at least the following content in the example case of a package
        named "foo" with version "1.0" containing a validly built wheel::

            {
                'name': 'foo',
                'description': 'A foomatic package',
                'releases': {
                    '1.0': {
                        'abis': {
                            'cp35m': {
                                'build_id': 1,
                                'status': 'success',
                                'skip': '',
                            }
                        },
                        'files': {
                            'foo-1.0-py3-none-any.whl': {
                                'hash': 'abcdef1234567890...',
                                'size': 123456,
                                'apt_dependencies': {'libc6'},
                            },
                        },
                        'released': datetime(2000, 1, 1, 12, 34, 56),
                        'yanked': False,
                        'skip': '',
                    },
                },
            }

        In addition to the keys shown above, the ``abi_builder``,
        ``abi_file``, ``platform`` and ``requires_python`` keys of each file
        are read. The keys of ``releases`` must be parsed version objects (as
        produced by :meth:`write_pages`) as their ``original`` and
        ``is_prerelease`` attributes are read.

        :param str package:
            The name of the package to write the project data for.

        :param dict data:
            The dictionary of data returned by
            :meth:`.db.Database.get_project_data` which is expected to have at
            least the structure documented above.
        """
        self.logger.info('writing project json for %s', package)
        project_data = {
            'package': package,
            'summary': data['description'],
            'pypi_url': 'https://pypi.org/project/{}'.format(package),
            'piwheels_url': 'https://www.piwheels.org/project/{}'.format(
                package),
            'releases': {
                version.original: {
                    'released': vers_data['released'].strftime(
                        '%Y-%m-%d %H:%M:%S'),
                    'prerelease': version.is_prerelease,
                    'yanked': vers_data['yanked'],
                    'skip_reason': vers_data['skip'],
                    'files': {
                        filename: {
                            'filehash': file_data['hash'],
                            'filesize': file_data['size'],
                            'builder_abi': file_data['abi_builder'],
                            'file_abi_tag': file_data['abi_file'],
                            'platform': file_data['platform'],
                            'requires_python': file_data['requires_python'],
                            'apt_dependencies': sorted(
                                file_data['apt_dependencies']),
                        }
                        for filename, file_data in vers_data['files'].items()
                    },
                }
                for version, vers_data in data['releases'].items()
            },
        }
        pkg_dir = self.output_path / 'project' / package / 'json'
        mkdir_override_symlink(pkg_dir)
        with AtomicReplaceFile(pkg_dir / 'index.json',
                               encoding='utf-8') as index:
            json.dump(project_data, index.file)

    def write_log(self, build_id, log):
        """
        Attempts to write the *log* of build *build_id* to the log output
        directories, splitting the numeric build id into three four-digit
        parts to flatten the output hierarchy (``logs/AAAA/BBBB/CCCC.txt.gz``).
        Log data is also gzip compressed.

        :param int build_id:
            The numeric identifier of the build.

        :param str log:
            The text of the build log.

        :raises FileExistsError:
            If a log for *build_id* has already been written.
        """
        self.logger.info('writing log for build %d', build_id)
        levels = []
        n = build_id
        for _ in range(3):
            n, m = divmod(n, 10000)
            levels.append(m)
        # Most significant part first, each zero-padded to four digits
        levels = ['{:04d}'.format(level) for level in reversed(levels)]
        log_dir = self.output_path / 'logs' / levels[0] / levels[1]
        log_dir.mkdir(parents=True, exist_ok=True)
        # No need for AtomicReplaceFile here. The log we're writing should
        # *never* exist. In fact, it should be an error if it does hence the
        # use of the "x" mode
        with (log_dir / (levels[2] + '.txt.gz')).open('xb') as f:
            with gzip.open(f, 'wt', encoding='utf-8',
                           errors='replace') as arc:
                arc.write(log)

    def delete_package(self, package):
        """
        Attempts to remove the index and project page directories (including
        all known wheel files) of the specified *package*, along with any
        project alias symlinks. Missing files and directories that cannot be
        removed are logged rather than raised.

        :param str package:
            The name of the package to delete.

        :raises RuntimeError:
            If *package* is empty (which would otherwise target the whole
            ``simple`` and ``project`` directories).
        """
        self.logger.info('deleting package %s', package)
        if not package:
            # refuse to delete /simple/ and /project/ by accident
            raise RuntimeError('Attempted to delete everything')
        # remove any symlinks for project aliases
        for project_alias in self.db.get_package_aliases(package):
            project_symlink = self.output_path / 'project' / project_alias
            try:
                project_symlink.unlink()
            except FileNotFoundError:
                self.logger.error('symlink not found: %s', project_symlink)
        pkg_dir = self.output_path / 'simple' / package
        proj_dir = self.output_path / 'project' / package
        proj_json_dir = proj_dir / 'json'
        files = {pkg_dir / f for f in self.db.get_package_files(package)}
        files |= {
            pkg_dir / 'index.html',
            proj_dir / 'index.html',
            proj_json_dir / 'index.json',
        }
        # try to delete every known wheel file, the HTML files and JSON file
        for file_path in files:
            try:
                file_path.unlink()
                self.logger.debug('file deleted: %s', file_path)
            except FileNotFoundError:
                self.logger.error('file not found: %s', file_path)
        # proj_json_dir must be removed before its parent proj_dir
        for dir_path in (pkg_dir, proj_json_dir, proj_dir):
            try:
                dir_path.rmdir()
            except OSError as e:
                self.logger.error('failed to remove directory %s: %s',
                                  dir_path, repr(e))

    def delete_version(self, package, version):
        """
        Attempts to remove any known wheel files corresponding with the
        deleted *version* of the specified *package*. Missing files are
        logged rather than raised.

        :param str package:
            The name of the package to delete files for.

        :param str version:
            The version of *package* to delete files for.
        """
        self.logger.info('deleting package %s version %s', package, version)
        pkg_dir = self.output_path / 'simple' / package
        for file in self.db.get_version_files(package, version):
            file_path = pkg_dir / file
            try:
                file_path.unlink()
                self.logger.info('File deleted: %s', file)
            except FileNotFoundError:
                self.logger.error('File not found: %s', file)
# https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """
    Collect data into fixed-length chunks or blocks. The final chunk is
    padded with *fillvalue* if *iterable* runs out part-way through it.
    """
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    # The same iterator repeated n times, so each output tuple consumes n
    # consecutive items
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def parse_version(s):
    """
    Parse the version string *s* with :func:`pkg_resources.parse_version`
    and return the result with the original string attached as its
    ``original`` attribute.
    """
    v = pkg_resources.parse_version(s)
    # Keep a reference to the original string as otherwise it's unrecoverable;
    # e.g. 0.1a parses to 0.1a0. As this is different, keyed lookups with the
    # parsed variant will fail
    v.original = s
    return v


class AtomicReplaceFile:
    """
    A context manager for atomically replacing a target file.

    Uses :class:`tempfile.NamedTemporaryFile` to construct a temporary file in
    the same directory as the target file. The associated file-like object is
    returned as the context manager's variable; you should write the content
    you wish to this object.

    When the context manager exits, if no exception has occurred, the
    temporary file will be renamed over the target file atomically (and its
    permissions will be set to exactly 0644; the umask is not applied). If an
    exception occurs during the context manager's block, or the temporary
    file cannot be moved into place, the temporary file will be deleted
    leaving the original target file unaffected and the exception will be
    (re-)raised.

    :param pathlib.Path path:
        The full path and filename of the target file. This is expected to be
        an absolute path.

    :param str encoding:
        If ``None`` (the default), the temporary file will be opened in binary
        mode. Otherwise, this specifies the encoding to use with text mode.
    """
    def __init__(self, path, encoding=None):
        if isinstance(path, str):
            path = Path(path)
        self._path = path
        # delete=False as the file is renamed (or removed) explicitly on exit
        self._tempfile = tempfile.NamedTemporaryFile(
            mode='wb' if encoding is None else 'w',
            dir=str(self._path.parent), encoding=encoding, delete=False)
        self._withfile = None

    def __enter__(self):
        self._withfile = self._tempfile.__enter__()
        return self._withfile

    def __exit__(self, exc_type, exc_value, exc_tb):
        temp_name = self._withfile.name
        replaced = False
        try:
            os.fchmod(self._withfile.file.fileno(), 0o644)
            # Flushes and closes the temporary file
            result = self._tempfile.__exit__(exc_type, exc_value, exc_tb)
            if exc_type is None:
                os.replace(temp_name, str(self._path))
                replaced = True
        finally:
            if not replaced:
                # Either the with-block raised, or moving the file into place
                # failed; never leave the temporary file lying around
                self._tempfile.close()
                try:
                    os.unlink(temp_name)
                except FileNotFoundError:
                    pass
        return result