Source code for indieweb_utils.indieauth.relmeauth

from typing import List, Set
from urllib.parse import urlparse as parse_url

import mf2py
import requests
from bs4 import BeautifulSoup

from ..parsing.parse import get_parsed_mf2_data
from ..utils.urls import canonicalize_url


[docs]def get_valid_relmeauth_links(
    url: str, require_rel_me_link_back: bool = True, html: str = None, parsed_mf2: mf2py.Parser = None
) -> List[str]:
    """
    Get the valid links on a page that point back to a rel=me URL per RelMeAuth.

    refs: https://indieweb.org/RelMeAuth
    refs: http://microformats.org/wiki/RelMeAuth

    :url: The url to parse.
    :type url: str
    :require_rel_me_link_back: Whether to require a rel=me link back to the specified URL.
        If this property is set to False, this function will return all sites that link back
        to the specified URL, even if they do not have a rel=me attribute. If this property is
        set to True (the default), this function will only return sites that have a rel=me link
        pointing back to your URL.
    :type require_rel_me_link_back: bool
    :return: The valid relmeauth links.
    :rtype: dict

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog"

        valid_relmeauth_links = indieweb_utils.get_valid_relmeauth_links(url)

        for link in valid_relmeauth_links:
            print(link)
    """

    domain = parse_url(url).netloc

    canonical_url = canonicalize_url(url, domain).strip("/")

    mf2_data = get_parsed_mf2_data(parsed_mf2, html, canonical_url)

    rel_me_links = [canonicalize_url(url, domain) for url in mf2_data["rels"].get("me", [])]

    valid_rel_me_links: Set[str] = set()

    if require_rel_me_link_back is False:
        return list(set(rel_me_links))

    for link in rel_me_links:
        try:
            link_valid = requests.get(link, timeout=5)
        except requests.exceptions.RequestException:
            continue

        if link_valid.status_code != 200:
            continue

        parsed_page = BeautifulSoup(link_valid.text, "html.parser")

        page_links = parsed_page.find_all("a") + parsed_page.find_all("link")

        link_domain = parse_url(link).netloc

        for item in page_links:
            # check if link is a rel me link
            # if it is, add the link to the list of valid rel me links

            if not item.get("rel") and require_rel_me_link_back is True:
                continue

            if "me" not in item.get("rel", "") and require_rel_me_link_back is True:
                continue

            if item.get("href") == canonical_url:
                canonical_link = canonicalize_url(link, link_domain)

                valid_rel_me_links.add(canonical_link)

    return list(valid_rel_me_links)