# Source code for indieweb_utils.posts.page_name

import mf2py
import requests
from bs4 import BeautifulSoup

from ..parsing.parse import RequestError, get_soup


def get_page_name(url: str, html: str = None, soup: BeautifulSoup = None) -> str:
    """
    Retrieve the name of a page using the Page Name Discovery algorithm.

    The markup is taken from ``html`` when it is given, otherwise from ``soup``;
    ``url`` is only fetched when neither is supplied. The top-level microformats
    items are scanned for an ``h-entry``; the first ``name`` (or, failing that,
    ``summary``) value found is returned. If no such value exists, the text of
    the document's ``<title>`` element is returned, and ``"Untitled"`` if there
    is no ``<title>`` element.

    :refs: https://indieweb.org/page-name-discovery

    :param url: The url of the page whose title you want to retrieve.
    :type url: str
    :param html: The HTML of the page whose title you want to retrieve.
        Takes precedence over ``soup`` when both are provided.
    :type html: str
    :param soup: An already-parsed document for the page, used when ``html``
        is not provided.
    :type soup: BeautifulSoup
    :return: A representative "name" for the page.
    :rtype: str
    :raises RequestError: If the page has to be fetched and the request fails.

    Example:

    .. code-block:: python

        import indieweb_utils

        page_name = indieweb_utils.get_page_name("https://jamesg.blog")

        print(page_name) # "Home | James' Coffee Blog"
    """

    if html:
        soup = get_soup(html)

    if soup is None:
        try:
            contents = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as err:
            raise RequestError(
                "Request to retrieve URL did not return a valid response."
            ) from err

        html = contents.text
        soup = BeautifulSoup(html, "html.parser")
    elif not html:
        # Only a soup was supplied: serialise it so the microformats parser
        # receives the same document instead of nothing at all.
        html = str(soup)

    parsed_mf2_tree = mf2py.parse(doc=html)

    # only search the top level of the tree
    # representative h-entries, which is what this function looks for, should not be lower down
    for item in parsed_mf2_tree["items"]:
        # An item may carry several types; accept it if any of them is h-entry.
        if "h-entry" not in item.get("type", []):
            continue

        properties = item.get("properties", {})

        # Prefer the entry's name, then fall back to its summary.
        for key in ("name", "summary"):
            values = properties.get(key)

            if values:
                return values[0]

    page_title = soup.title

    if page_title:
        return page_title.text

    return "Untitled"