HEX

File: //home/arjun/projects/env/lib/python3.10/site-packages/weasyprint/document.py
"""Document generation management."""

import functools
import io
from hashlib import md5
from pathlib import Path

from . import CSS, DEFAULT_OPTIONS
from .anchors import gather_anchors, make_page_bookmark_tree
from .css import get_all_computed_styles
from .css.counters import CounterStyle
from .css.targets import TargetCollector
from .draw import draw_page, stacked
from .formatting_structure.build import build_formatting_structure
from .html import get_html_metadata
from .images import get_image_from_uri as original_get_image_from_uri
from .layout import LayoutContext, layout_document
from .logger import PROGRESS_LOGGER
from .matrix import Matrix
from .pdf import generate_pdf
from .text.fonts import FontConfiguration


class Page:
    """Represents a single rendered page.

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style[f'bleed_{side}'].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(level, label, target, state)``
        #: :obj:`tuples <tuple>`. ``level`` and ``label`` are respectively an
        #: :obj:`int` and a :obj:`string <str>`, based on the CSS properties
        #: of the same names. ``target`` is an ``(x, y)`` point in CSS pixels
        #: from the top-left of the page.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle, box)``
        #: :obj:`tuples <tuple>`. A ``rectangle`` is ``(x, y, width, height)``,
        #: in CSS pixels from the top-left of the page. ``link_type`` is one of
        #: three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        #: The :obj:`list` of ``(element, attributes, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x, y, width, height)``, in CSS
        #: pixels from the top-left of the page. ``atributes`` is a
        #: :obj:`dict` of HTML tag attributes and values.
        self.inputs = []

        gather_anchors(
            page_box, self.anchors, self.links, self.bookmarks, self.inputs)
        self._page_box = page_box

    def paint(self, stream, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type stream: ``document.Stream``
        :param stream:
            A document stream.
        :param float left_x:
            X coordinate of the left of the page, in PDF points.
        :param float top_y:
            Y coordinate of the top of the page, in PDF points.
        :param float scale:
            Zoom scale.
        :param bool clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(stream):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            stream.transform(a=scale, d=scale, e=left_x, f=top_y)
            if clip:
                stream.rectangle(0, 0, self.width, self.height)
                stream.clip()
            draw_page(self._page_box, stream)


class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None, lang=None, custom=None):
        #: The title of the document, as a string or :obj:`None`.
        #: Extracted from the ``<title>`` element in HTML
        #: and written to the ``/Title`` info field in PDF.
        self.title = title
        #: The authors of the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from the ``<meta name=author>`` elements in HTML
        #: and written to the ``/Author`` info field in PDF.
        self.authors = authors or []
        #: The description of the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=description>`` element in HTML
        #: and written to the ``/Subject`` info field in PDF.
        self.description = description
        #: Keywords associated with the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from ``<meta name=keywords>`` elements in HTML
        #: and written to the ``/Keywords`` info field in PDF.
        self.keywords = keywords or []
        #: The name of one of the software packages
        #: used to generate the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=generator>`` element in HTML
        #: and written to the ``/Creator`` info field in PDF.
        self.generator = generator
        #: The creation date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <https://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.created>`` element in HTML
        #: and written to the ``/CreationDate`` info field in PDF.
        self.created = created
        #: The modification date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <https://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.modified>`` element in HTML
        #: and written to the ``/ModDate`` info field in PDF.
        self.modified = modified
        #: File attachments, as a list of tuples of URL and a description or
        #: :obj:`None`. (Defaults to the empty list.)
        #: Extracted from the ``<link rel=attachment>`` elements in HTML
        #: and written to the ``/EmbeddedFiles`` dictionary in PDF.
        self.attachments = attachments or []
        #: Document language as BCP 47 language tags.
        #: Extracted from ``<html lang=lang>`` in HTML.
        self.lang = lang
        #: Custom metadata, as a dict whose keys are the metadata names and
        #: values are the metadata values.
        self.custom = custom or {}


class DiskCache:
    """Dict-like storing images content on disk.

    Bytestring values are stored on disk. Other lightweight Python objects
    (i.e. RasterImage instances) are still stored in memory.

    """
    def __init__(self, folder):
        self._path = Path(folder)
        self._path.mkdir(parents=True, exist_ok=True)
        self._memory_cache = {}
        self._disk_paths = set()

    def _path_from_key(self, key):
        return self._path / md5(key.encode()).hexdigest()

    def __getitem__(self, key):
        if key in self._memory_cache:
            return self._memory_cache[key]
        else:
            return self._path_from_key(key).read_bytes()

    def __setitem__(self, key, value):
        if isinstance(value, bytes):
            path = self._path_from_key(key)
            self._disk_paths.add(path)
            path.write_bytes(value)
        else:
            self._memory_cache[key] = value

    def __contains__(self, key):
        return (
            key in self._memory_cache or
            self._path_from_key(key).exists())

    def __del__(self):
        try:
            for path in self._disk_paths:
                path.unlink(missing_ok=True)
            self._path.rmdir()
        except Exception:
            # Silently ignore errors while clearing cache
            pass


class Document:
    """A rendered document ready to be painted in a pydyf stream.

    Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
    can also be instantiated directly with a list of :class:`pages <Page>`, a
    set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
    <weasyprint.default_url_fetcher>` function, and a :class:`font_config
    <weasyprint.text.fonts.FontConfiguration>`.

    """

    @classmethod
    def _build_layout_context(cls, html, font_config, counter_style, options):
        if font_config is None:
            font_config = FontConfiguration()
        if counter_style is None:
            counter_style = CounterStyle()
        target_collector = TargetCollector()
        page_rules = []
        user_stylesheets = []
        cache = options['cache']
        if cache is None:
            cache = {}
        elif not isinstance(cache, (dict, DiskCache)):
            cache = DiskCache(cache)
        for css in options['stylesheets'] or []:
            if not hasattr(css, 'matcher'):
                css = CSS(
                    guess=css, media_type=html.media_type,
                    font_config=font_config, counter_style=counter_style)
            user_stylesheets.append(css)
        style_for = get_all_computed_styles(
            html, user_stylesheets, options['presentational_hints'],
            font_config, counter_style, page_rules, target_collector,
            options['pdf_forms'])
        get_image_from_uri = functools.partial(
            original_get_image_from_uri, cache=cache,
            url_fetcher=html.url_fetcher, options=options)
        PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
        context = LayoutContext(
            style_for, get_image_from_uri, font_config, counter_style,
            target_collector)
        return context

    @classmethod
    def _render(cls, html, font_config, counter_style, options):
        if font_config is None:
            font_config = FontConfiguration()

        if counter_style is None:
            counter_style = CounterStyle()

        context = cls._build_layout_context(
            html, font_config, counter_style, options)

        root_box = build_formatting_structure(
            html.etree_element, context.style_for, context.get_image_from_uri,
            html.base_url, context.target_collector, counter_style,
            context.footnotes)

        page_boxes = layout_document(html, root_box, context)
        rendering = cls(
            [Page(page_box) for page_box in page_boxes],
            DocumentMetadata(**get_html_metadata(html)),
            html.url_fetcher, font_config)
        rendering._html = html
        return rendering

    def __init__(self, pages, metadata, url_fetcher, font_config):
        #: A list of :class:`Page` objects.
        self.pages = pages
        #: A :class:`DocumentMetadata` object.
        #: Contains information that does not belong to a specific page
        #: but to the whole document.
        self.metadata = metadata
        #: A function or other callable with the same signature as
        #: :func:`weasyprint.default_url_fetcher` called to fetch external
        #: resources such as stylesheets and images. (See :ref:`URL Fetchers`.)
        self.url_fetcher = url_fetcher
        #: A :obj:`dict` of fonts used by the document. Keys are hashes used to
        #: identify fonts, values are ``Font`` objects.
        self.fonts = {}

        # Keep a reference to font_config to avoid its garbage collection until
        # rendering is destroyed. This is needed as font_config.__del__ removes
        # fonts that may be used when rendering
        self.font_config = font_config

    def build_element_structure(self, structure, etree_element=None):
        if etree_element is None:
            etree_element = self._html.etree_element
            structure[etree_element] = {'parent': None}
        for child in etree_element:
            structure[child] = {'parent': etree_element}
            self.build_element_structure(structure, child)

    def copy(self, pages='all'):
        """Take a subset of the pages.

        :type pages: :term:`iterable`
        :param pages:
            An iterable of :class:`Page` objects from :attr:`pages`.
        :return:
            A new :class:`Document` object.

        Examples:

        Write two PDF files for odd-numbered and even-numbered pages::

            # Python lists count from 0 but pages are numbered from 1.
            # [::2] is a slice of even list indexes but odd-numbered pages.
            document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
            document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

        Combine multiple documents into one PDF file,
        using metadata from the first::

            all_pages = [p for doc in documents for p in doc.pages]
            documents[0].copy(all_pages).write_pdf('combined.pdf')

        """
        if pages == 'all':
            pages = self.pages
        elif not isinstance(pages, list):
            pages = list(pages)
        return type(self)(
            pages, self.metadata, self.url_fetcher, self.font_config)

    def make_bookmark_tree(self, scale=1, transform_pages=False):
        """Make a tree of all bookmarks in the document.

        :param float scale:
            Zoom scale.
        :param bool transform_pages:
            A boolean defining whether the default PDF page transformation
            matrix has to be applied to bookmark coordinates, setting the
            bottom-left corner as the origin.
        :return: A list of bookmark subtrees.
            A subtree is ``(label, target, children, state)``. ``label`` is
            a string, ``target`` is ``(page_number, x, y)``  and ``children``
            is a list of child subtrees.

        """
        root = []
        # At one point in the document, for each "output" depth, how much
        # to add to get the source level (CSS values of bookmark-level).
        # E.g. with <h1> then <h3>, level_shifts == [0, 1]
        # 1 means that <h3> has depth 3 - 1 = 2 in the output.
        skipped_levels = []
        last_by_depth = [root]
        previous_level = 0
        for page_number, page in enumerate(self.pages):
            if transform_pages:
                matrix = Matrix(a=scale, d=-scale, f=page.height * scale)
            else:
                matrix = Matrix(a=scale, d=scale)
            previous_level = make_page_bookmark_tree(
                page, skipped_levels, last_by_depth, previous_level,
                page_number, matrix)
        return root

    def write_pdf(self, target=None, zoom=1, finisher=None, **options):
        """Paint the pages in a PDF file, with metadata.

        :type target:
            :class:`str`, :class:`pathlib.Path` or :term:`file object`
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :param float zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type finisher: :term:`callable`
        :param finisher:
            A finisher function or callable that accepts the document and a
            :class:`pydyf.PDF` object as parameters. Can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :param options:
            The ``options`` parameter includes by default the
            :data:`weasyprint.DEFAULT_OPTIONS` values.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        new_options = DEFAULT_OPTIONS.copy()
        new_options.update(options)
        options = new_options
        pdf = generate_pdf(self, target, zoom, **options)

        identifier = options['pdf_identifier']
        compress = not options['uncompressed_pdf']

        if finisher:
            finisher(self, pdf)

        if target is None:
            output = io.BytesIO()
            pdf.write(output, pdf.version, identifier, compress)
            return output.getvalue()

        if hasattr(target, 'write'):
            pdf.write(target, pdf.version, identifier, compress)
        else:
            with open(target, 'wb') as fd:
                pdf.write(fd, pdf.version, identifier, compress)