Source code for indieweb_utils.webmentions.discovery

import ipaddress
from dataclasses import dataclass
from typing import Dict, List, Optional
from urllib import parse as url_parse

import requests
from bs4 import BeautifulSoup

from ..utils.urls import _is_http_url

_WEBMENTION = "webmention"  # TODO: Move this to a constants file


@dataclass
class WebmentionDiscoveryResponse:
    endpoint: str


class TargetNotProvided(Exception):
    pass


class WebmentionEndpointNotFound(Exception):
    pass


class UnacceptableIPAddress(Exception):
    """
    Raised if an IP address for a webmention endpoint does not resolve on the public internet.

    Local, loopback, private, reserved, and multicast IP addresses are not acceptable.
    """


class LocalhostEndpointFound(Exception):
    pass
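

# A minimal illustration (not part of the library's public API) of the
# standard-library ipaddress checks behind UnacceptableIPAddress above.
# The addresses are examples only; this helper is never called at import time.
def _demo_unacceptable_addresses() -> None:
    for address in ("127.0.0.1", "10.0.0.1", "224.0.0.1", "169.254.1.1"):
        ip = ipaddress.ip_address(address)
        # Each of these prints True, so each endpoint would be rejected.
        print(
            address,
            ip.is_loopback or ip.is_private or ip.is_multicast or ip.is_link_local,
        )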


def discover_webmention_endpoint(target: str) -> WebmentionDiscoveryResponse:
    """
    Return the webmention endpoint for the given target.

    :param target: The target to discover the webmention endpoint for.
    :type target: str
    :return: The discovered webmention endpoint.
    :rtype: WebmentionDiscoveryResponse

    .. code-block:: python

        import indieweb_utils

        target = "https://jamesg.blog/"

        webmention_endpoint = indieweb_utils.discover_webmention_endpoint(target)

        print(webmention_endpoint.endpoint)
        # https://webmention.jamesg.blog/webmention

    :raises TargetNotProvided: Target is not provided.
    :raises WebmentionEndpointNotFound: Webmention endpoint is not found.
    :raises UnacceptableIPAddress: Endpoint does not connect to an accepted IP.
    :raises LocalhostEndpointFound: Discovered endpoint is equal to localhost.
    """
    if not target:
        raise TargetNotProvided("No target provided.")

    endpoints = discover_endpoints(target, [_WEBMENTION])

    endpoint = endpoints.get(_WEBMENTION)

    if endpoint is None:
        raise WebmentionEndpointNotFound("No webmention endpoint could be found for this resource.")

    # Reject endpoints that are bare IP addresses outside the public internet.
    try:
        endpoint_as_ip = ipaddress.ip_address(endpoint)

        if (
            endpoint_as_ip.is_private
            or endpoint_as_ip.is_multicast
            or endpoint_as_ip.is_loopback
            or endpoint_as_ip.is_unspecified
            or endpoint_as_ip.is_reserved
            or endpoint_as_ip.is_link_local
        ):
            raise UnacceptableIPAddress("The endpoint does not connect to an accepted IP address.")
    except ValueError:
        # The endpoint is not a bare IP address, so no IP check applies.
        pass

    if endpoint == "localhost":
        raise LocalhostEndpointFound("The endpoint is localhost.")

    # An empty endpoint value means the target accepts webmentions at its own URL.
    if endpoint == "":
        endpoint = target

    valid_starts = ("http://", "https://", "/")

    # A bare value such as "webmention" is relative to the target's directory.
    if not any(endpoint.startswith(valid_start) for valid_start in valid_starts):
        endpoint = "/".join(target.split("/")[:-1]) + "/" + endpoint

    # A root-relative value such as "/webmention" is resolved against the target's host.
    if endpoint.startswith("/"):
        endpoint = "https://" + url_parse.urlsplit(target).netloc + endpoint

    return WebmentionDiscoveryResponse(endpoint=endpoint)
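

# An illustrative sketch (not part of the library) of how the resolution
# logic above treats absolute, root-relative, and bare endpoint values.
# The target and endpoint values here are hypothetical examples.
def _demo_endpoint_resolution() -> None:
    target = "https://example.com/posts/1"

    for raw in ("https://example.com/webmention", "/webmention", "webmention"):
        endpoint = raw

        if not endpoint.startswith(("http://", "https://", "/")):
            endpoint = "/".join(target.split("/")[:-1]) + "/" + endpoint

        if endpoint.startswith("/"):
            endpoint = "https://" + url_parse.urlsplit(target).netloc + endpoint

        print(raw, "->", endpoint)
        # https://example.com/webmention -> https://example.com/webmention
        # /webmention -> https://example.com/webmention
        # webmention -> https://example.com/posts/webmention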


def discover_endpoints(
    url: str, headers_to_find: List[str], request: Optional[requests.Response] = None
) -> Dict[str, str]:
    """
    Return a dictionary of specified endpoint locations for the given URL, if available.

    :param url: The URL to discover endpoints for.
    :type url: str
    :param headers_to_find: The headers to find. Values you may want to use include:
        microsub, micropub, token_endpoint, authorization_endpoint, subscribe.
    :type headers_to_find: list[str]
    :param request: An optional response to inspect instead of fetching the URL.
    :type request: requests.Response
    :return: The discovered endpoints.
    :rtype: dict[str, str]

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/"

        # find the microsub rel link on a web page
        headers_to_find = ["microsub"]

        endpoints = indieweb_utils.discover_endpoints(url, headers_to_find)

        print(endpoints)
        # {'microsub': 'https://aperture.p3k.io/'}

    :raises requests.exceptions.RequestException: Error raised while making the network request to discover endpoints.
    """
    response: Dict[str, str] = {}

    if request:
        endpoint_request = request
    else:
        # May raise requests.exceptions.RequestException, as documented above.
        endpoint_request = requests.get(url, timeout=5)

    # Check HTTP Link headers first.
    link_headers = _find_links_in_headers(headers=endpoint_request.headers, target_headers=headers_to_find)

    for header in link_headers:
        response[header] = link_headers[header]["url"]

    # Then fall back to <link> elements in the HTML body.
    response.update(_find_links_html(body=endpoint_request.text, target_headers=headers_to_find))

    return response
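

# A hedged sketch of the optional ``request`` parameter above: passing a
# canned requests.Response avoids any network traffic. Setting ``_content``
# touches a private requests attribute and is for illustration only; the
# header value and URLs are hypothetical.
def _demo_discover_endpoints_offline() -> None:
    canned = requests.Response()
    canned.status_code = 200  # a Response is only truthy with a successful status
    canned.headers["link"] = '<https://example.com/webmention>; rel="webmention"'
    canned._content = b"<html><head></head><body></body></html>"

    print(discover_endpoints("https://example.com/", ["webmention"], request=canned))
    # {'webmention': 'https://example.com/webmention'}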


def _find_links_in_headers(*, headers, target_headers: List[str]) -> Dict[str, Dict[str, str]]:
    """Return a dictionary { rel: { url: 'url', mime_type: 'mime_type' } } containing the target headers."""
    found: Dict[str, Dict[str, str]] = {}

    links = headers.get("link")

    if not links:
        return found

    # Example: [{'url': 'https://micropub.jamesg.blog/micropub', 'rel': 'micropub'}]
    parsed_link_headers: List[Dict[str, str]] = requests.utils.parse_header_links(links)

    for header in parsed_link_headers:
        url = header.get("url", "")
        rel = header.get("rel", "")
        mime_type = header.get("type", "")

        if _is_http_url(url) and rel in target_headers:
            found[rel] = {
                "url": url,
                "mime_type": mime_type,
            }

    # The legacy pingback endpoint is advertised in its own x-pingback header
    # rather than in a Link header.
    if "x-pingback" in target_headers:
        pingback_url = headers.get("x-pingback", "")

        if _is_http_url(pingback_url):
            found["pingback"] = {
                "url": pingback_url,
                "mime_type": "",
            }

    return found


def _find_links_html(*, body: str, target_headers: List[str]) -> Dict[str, str]:
    """Return a dictionary { rel: href } for <link> elements whose rel value is in target_headers."""
    soup = BeautifulSoup(body, "html.parser")

    found: Dict[str, str] = {}

    for link in soup.find_all("link"):
        try:
            rel = link.get("rel", [])[0]
            href = link.get("href")
        except IndexError:
            continue

        if _is_http_url(href) and rel in target_headers:
            found[rel] = href

    return found
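

# A minimal usage sketch, assuming network access and that the target site
# advertises a webmention endpoint; run this file directly to try discovery.
if __name__ == "__main__":
    result = discover_webmention_endpoint("https://jamesg.blog/")
    print(result.endpoint)
    # e.g. https://webmention.jamesg.blog/webmention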