import ipaddress
from dataclasses import dataclass
from typing import Dict, List
from urllib import parse as url_parse
import requests
from bs4 import BeautifulSoup
from ..utils.urls import _is_http_url
# rel value searched for during webmention endpoint discovery.
_WEBMENTION = "webmention" # TODO: Move this to a constants file
@dataclass
class WebmentionDiscoveryResponse:
    """Result of a successful webmention endpoint discovery."""

    # Absolute URL of the discovered webmention endpoint.
    endpoint: str
class TargetNotProvided(Exception):
    """Raised when no target URL is supplied for endpoint discovery."""

    pass
class WebmentionEndpointNotFound(Exception):
    """Raised when no webmention endpoint can be found for a resource."""

    pass
class UnacceptableIPAddress(Exception):
    """
    Raised if an IP address for a webmention endpoint does not resolve on the public internet.
    Local, loopback, private, reserved, and multicast IP addresses are not acceptable.
    """
class LocalhostEndpointFound(Exception):
    """Raised when the discovered endpoint points at localhost."""

    pass
def discover_webmention_endpoint(target: str) -> WebmentionDiscoveryResponse:
    """
    Return the webmention endpoint for the given target.

    :param target: The target to discover the webmention endpoint for.
    :type target: str
    :return: The discovered webmention endpoint.
    :rtype: WebmentionDiscoveryResponse

    .. code-block:: python

        import indieweb_utils

        target = "https://jamesg.blog/"

        webmention_endpoint = indieweb_utils.discover_webmention_endpoint(
            target
        )

        print(webmention_endpoint) # https://webmention.jamesg.blog/webmention

    :raises TargetNotProvided: Target is not provided.
    :raises WebmentionEndpointNotFound: Webmention endpoint is not found.
    :raises UnacceptableIPAddress: Endpoint does not connect to an accepted IP.
    :raises LocalhostEndpointFound: Discovered endpoint is equal to localhost.
    """
    if not target:
        raise TargetNotProvided("No target provided.")

    endpoints = discover_endpoints(target, [_WEBMENTION])

    endpoint = endpoints.get(_WEBMENTION)

    if endpoint is None:
        raise WebmentionEndpointNotFound("No webmention endpoint could be found for this resource.")

    # Reject endpoints that are bare IP literals outside the public internet
    # (SSRF guard). A ValueError from ip_address() simply means the endpoint
    # is a hostname or URL rather than an IP literal, so no IP check applies.
    try:
        endpoint_as_ip = ipaddress.ip_address(endpoint)

        # Fix: the original checked is_private/is_multicast on the *string*
        # `endpoint`, which raised AttributeError for any IP-literal endpoint.
        if (
            endpoint_as_ip.is_private
            or endpoint_as_ip.is_multicast
            or endpoint_as_ip.is_loopback
            or endpoint_as_ip.is_unspecified
            or endpoint_as_ip.is_reserved
            or endpoint_as_ip.is_link_local
        ):
            raise UnacceptableIPAddress("The endpoint does not connect to an accepted IP address.")
    except ValueError:
        pass

    if endpoint == "localhost":
        raise LocalhostEndpointFound("The endpoint is localhost.")

    # An empty rel value means the target document itself is the endpoint.
    if endpoint == "":
        endpoint = target

    valid_starts = ("http://", "https://", "/")

    if not any(endpoint.startswith(valid_start) for valid_start in valid_starts):
        # Document-relative endpoint: resolve against the target's "directory".
        endpoint = "/".join(target.split("/")[:-1]) + "/" + endpoint

    if endpoint.startswith("/"):
        # Root-relative endpoint: prepend the target's scheme and host.
        # Fix: the original produced "https://" + scheme + endpoint
        # (e.g. "https://https/webmention"), dropping the host entirely.
        parsed_target = url_parse.urlsplit(target)
        endpoint = parsed_target.scheme + "://" + parsed_target.netloc + endpoint

    return WebmentionDiscoveryResponse(endpoint=endpoint)
def discover_endpoints(url: str, headers_to_find: List[str], request: requests.Response = None):
    """
    Return a dictionary of specified endpoint locations for the given URL, if available.

    :param url: The URL to discover endpoints for.
    :type url: str
    :param headers_to_find: The rel values / headers to find.
        Values you may want to use include: microsub, micropub, token_endpoint,
        authorization_endpoint, subscribe.
    :type headers_to_find: list[str]
    :param request: An already-fetched response to inspect instead of making
        a new network request.
    :type request: requests.Response
    :return: The discovered endpoints.
    :rtype: dict[str, str]

    .. code-block:: python

        import indieweb_utils
        import requests

        url = "https://jamesg.blog/"

        # find the microsub rel link on a web page
        headers_to_find = ["microsub"]

        endpoints = indieweb_utils.discover_endpoints(
            url, headers_to_find
        )

        print(endpoints) # {'microsub': 'https://aperture.p3k.io/'}

    :raises requests.exceptions.RequestException: Error raised while making the network request to discover endpoints.
    """
    response: Dict[str, str] = {}

    if request:
        endpoint_request = request
    else:
        try:
            endpoint_request = requests.get(url, timeout=5)
        except requests.exceptions.RequestException:
            # Fix: re-raise the original RequestException instead of masking
            # it with a bare Exception; this matches the documented contract
            # and preserves the error details for callers.
            raise

    link_headers = _find_links_in_headers(headers=endpoint_request.headers, target_headers=headers_to_find)

    for header, details in link_headers.items():
        response[header] = details["url"]

    # NOTE(review): HTML <link> elements overwrite HTTP Link headers here;
    # the Webmention spec gives the HTTP header precedence — confirm intent
    # before changing, as callers may depend on the current order.
    response.update(_find_links_html(body=endpoint_request.text, target_headers=headers_to_find))

    return response
def _find_links_in_headers(*, headers, target_headers: List[str]) -> Dict[str, Dict[str, str]]:
    """Return a dictionary { rel: { url: 'url', mime_type: 'mime_type' } } containing the target headers."""
    found: Dict[str, Dict[str, str]] = {}

    links = headers.get("link")

    if not links:
        # No Link header at all: nothing to parse (the x-pingback check
        # below is intentionally skipped too, matching prior behavior).
        return found

    # e.g. [{'url': 'https://micropub.jamesg.blog/micropub', 'rel': 'micropub'}]
    parsed_link_headers: List[Dict[str, str]] = requests.utils.parse_header_links(links)

    for header in parsed_link_headers:
        url = header.get("url", "")
        rel = header.get("rel", "")
        mime_type = header.get("type", "")

        if _is_http_url(url) and rel in target_headers:
            found[rel] = {
                "url": url,
                "mime_type": mime_type,
            }

    # The x-pingback endpoint lives in its own header, not the Link header.
    if "x-pingback" in target_headers:
        pingback_url = headers.get("x-pingback")

        # Guard against a missing header before validating the URL.
        if pingback_url and _is_http_url(pingback_url):
            # Fix: record the pingback URL itself; the original stored the
            # leftover loop variable `url` (the last Link-header URL, or a
            # NameError when no Link entries were parsed).
            found["pingback"] = {
                "url": pingback_url,
                "mime_type": "",
            }

    return found
def _find_links_html(*, body: str, target_headers: List[str]) -> Dict[str, str]:
    """Collect rel -> href mappings from <link> elements whose first rel value is in target_headers."""
    discovered: Dict[str, str] = {}

    document = BeautifulSoup(body, "html.parser")

    for link_tag in document.find_all("link"):
        rel_values = link_tag.get("rel", [])

        # A <link> with no rel values cannot match any target header.
        if not rel_values:
            continue

        first_rel = rel_values[0]
        href = link_tag.get("href")

        if _is_http_url(href) and first_rel in target_headers:
            discovered[first_rel] = href

    return discovered