# Source code for indieweb_utils.utils.url_summary

import re
from urllib.parse import urlparse

from .url_summary_templates import URL_SUMMARY_TEMPLATES
from .urls import canonicalize_url


class InvalidURL(Exception):
    """Signal that a supplied URL is incorrectly formatted."""


def get_url_summary(url: str, custom_templates: list = None) -> str:
    """
    Return a text summary for given `url`.

    The URL is first passed through ``canonicalize_url``. Its domain (with a
    single leading ``www.`` removed) selects the built-in templates from
    ``URL_SUMMARY_TEMPLATES``; the built-in templates for that domain are tried
    first, followed by every entry of `custom_templates`. Custom templates are
    not keyed by domain: they are tried for every URL. The first pattern that
    matches the start of the URL path (leading slashes removed) wins, and its
    named groups are substituted into the summary string.

    :param url: The URL to summarize.
    :param custom_templates: A list of ``(pattern, summary)`` tuples. ``pattern``
        is a regular expression matched with ``re.match`` against the URL path;
        ``summary`` is a ``str.format`` template filled with the pattern's
        named groups. Defaults to no custom templates.
    :return: A summary of the URL, or ``"A post by <domain>"`` when no
        template matches.
    :rtype: str
    :raises InvalidURL: If no domain can be extracted from the URL.

    .. code-block:: python

        from indieweb_utils.utils.url_summary import get_url_summary

        # a list of custom patterns against which to match during the lookup
        custom_templates = [
            (r"coffee/maps/(?P<location>.+)", "A map of {location} coffee shops on jamesg.blog")
        ]

        summary = get_url_summary("https://github.com/capjamesg/indieweb-utils/pulls/1")

        print(summary) # "A comment on a pull request in the indieweb-utils GitHub repository"

        summary = get_url_summary("https://jamesg.blog/coffee/maps/london", custom_templates)

        print(summary) # "A map of london coffee shops on jamesg.blog"
    """
    url = canonicalize_url(url)

    if custom_templates is None:
        custom_templates = []

    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    # Remove only a literal "www." prefix. str.lstrip("www.") would strip any
    # leading run of the characters "w" and ".", turning e.g. "web.dev" into
    # "eb.dev" and "wordpress.com" into "ordpress.com".
    if domain.startswith("www."):
        domain = domain[len("www."):]

    if not domain:
        raise InvalidURL("The provided URL is incorrectly formatted.")

    # The path is the same for every template, so compute it once.
    path = parsed_url.path.lstrip("/")

    # Built-in templates for the domain take precedence over custom ones.
    candidates = [*URL_SUMMARY_TEMPLATES.get(domain, []), *custom_templates]

    for pattern, summary in candidates:
        if match := re.match(pattern, path):
            return summary.format(**match.groupdict())

    return "A post by " + domain