"""Source code for indieweb_utils.posts.discovery: original post, author, and post type discovery."""

import re
from typing import List, Tuple
from urllib import parse as url_parse

import mf2py
import requests
from bs4 import BeautifulSoup

from ..parsing.parse import get_parsed_mf2_data, get_soup
from ..utils.urls import _is_http_url, canonicalize_url

# This regex identifies permashortlink citations in the form of (example.com slug)
# Permashortlink citations may be used as a link to a post that does not contain a hyperlink
# Checking for a permashortlink citation is a step in the Original Post Discovery algorithm
# More on permashortlink citations: https://indieweb.org/permashortcitation
PERMASHORTLINK_CITATION_BRACKET_MATCHING: str = r"\((.*?)\)"


class PostDiscoveryError(Exception):
    # Raised when original post discovery fails, e.g. a candidate URL cannot be
    # retrieved or a page is not marked up with an h-entry.
    pass


class PostTypeFormattingError(Exception):
    # Raised when custom_properties passed to get_post_type are not a list of
    # (str, str) tuples.
    pass


def _process_candidate_url(candidate_url: str, posse_permalink: str, parsed_post: BeautifulSoup) -> str:
    """
    Check whether a candidate URL syndicates to the given POSSE permalink.

    Fetches the candidate URL and inspects its ``u-syndication`` links, then
    falls back to the source post's ``rel=syndication`` link elements.

    :param candidate_url: The URL whose syndication links should be inspected.
    :param posse_permalink: The POSSE copy we are trying to match.
    :param parsed_post: The parsed source post (used for rel=syndication links).
    :return: The original post URL, or an empty string if no match was found.
    :raises PostDiscoveryError: If the candidate URL cannot be retrieved.
    """
    try:
        request = requests.get(candidate_url, timeout=5)
    except requests.exceptions.RequestException:
        raise PostDiscoveryError("Could not get candidate url")

    parsed_candidate_url = BeautifulSoup(request.text, "lxml")

    posse_domain = url_parse.urlsplit(posse_permalink).netloc

    for link in parsed_candidate_url.select("a"):
        # BUG FIX: anchors without a class attribute return None from
        # link.get("class"), so the "in" test raised TypeError; guard with []
        if "u-syndication" in (link.get("class") or []):
            original_post_url = _syndication_check(
                link.get("href"), posse_permalink, candidate_url, posse_domain
            )

            if original_post_url:
                return original_post_url

    for header in parsed_post.select("link[rel='syndication']"):
        if header.get("href") == posse_permalink:
            original_post_url = _syndication_check(
                header.get("href"), posse_permalink, candidate_url, posse_domain
            )

            if original_post_url:
                return original_post_url

    return ""


def _check_for_link_in_post(last_text: BeautifulSoup) -> str:
    """
    Extract a candidate original-post URL from the last paragraph of a post's
    e-content.

    Looks first for a permashortlink citation of the form ``(example.com slug)``,
    then for a plain URL at the end of the paragraph.

    :param last_text: The ``.e-content`` elements selected from the post
        (a non-empty BeautifulSoup result set).
    :return: The candidate URL, or an empty string if none was found.
    """
    final_paragraph = last_text[0].select("p")[-1]

    # if permashortlink citation
    # format = (url.com id)
    permashortlink = re.search(PERMASHORTLINK_CITATION_BRACKET_MATCHING, final_paragraph.text)

    if permashortlink is not None:
        # BUG FIX: build http://<domain>/<slug> from the citation contents.
        # The original concatenated the full match (including the parentheses)
        # with the inner group, producing an invalid URL, and raised
        # UnboundLocalError when no citation was present at all.
        citation_parts = permashortlink.group(1).split(" ", 1)

        if len(citation_parts) == 2:
            domain, slug = citation_parts
            return "http://" + domain + "/" + slug

    # check for url at end
    split_text = final_paragraph.text.split(" ")

    if _is_http_url(split_text[-1]):
        return split_text[-1]

    return ""


def discover_original_post(posse_permalink: str, soup: BeautifulSoup = None, html: str = "") -> str:
    """
    Find the original version of a post per the Original Post Discovery algorithm.

    refs: https://indieweb.org/original-post-discovery#Algorithm

    :param posse_permalink: The permalink of the post.
    :type posse_permalink: str
    :param soup: An optional pre-parsed document for the post. Specifying this
        prevents the page being fetched/re-parsed.
    :type soup: BeautifulSoup
    :param html: Optional raw HTML for the post, used when soup is not given.
    :type html: str
    :return: The original post permalink.
    :rtype: str

    Example:

    .. code-block:: python

        import indieweb_utils

        original_post_url = indieweb_utils.discover_original_post("https://example.com")

        print(original_post_url)

    :raises PostDiscoveryError: A candidate URL cannot be retrieved or when a
        specified post is not marked up with h-entry.
    """
    parsed_post = get_soup(html, posse_permalink) if soup is None else soup

    # Get the post h-entry
    post_h_entry = parsed_post.select(".h-entry")

    if not post_h_entry:
        raise PostDiscoveryError("Could not find h-entry")

    post_h_entry = post_h_entry[0]

    # select with u-url and u-uid
    # NOTE(review): ".u-url .u-uid" selects a .u-uid *descendant* of .u-url; an
    # element carrying both classes would be ".u-url.u-uid" — confirm intent.
    url_and_uid = post_h_entry.select(".u-url .u-uid")
    if url_and_uid:
        return url_and_uid[0].get("href")

    canonical_links = parsed_post.select("link[rel='canonical']")
    if canonical_links:
        return canonical_links[0].get("href")

    # look for text with "see original" anchor text
    for link in parsed_post.select("a"):
        if link.text.lower() == "see original" and link.get("href"):
            return link.get("href")

    candidate_url = None
    last_text = post_h_entry.select(".e-content")

    if last_text:
        candidate_url = _check_for_link_in_post(last_text)

    if candidate_url:
        post_url = _process_candidate_url(candidate_url, posse_permalink, parsed_post)

        if post_url != "":
            return post_url

    return ""
def _discover_h_card_from_author_page(author_url: str, rel_author: str) -> dict:
    """
    Fetch an author page and return the best-matching h-card on it.

    Match priority (per the IndieWeb Authorship spec):
    1. an h-card whose url matches rel_author and whose uid equals its url;
    2. an h-card whose url appears among the page's rel=me links;
    3. an h-card whose url matches rel_author.

    :param author_url: The URL of the author page to parse.
    :param rel_author: The rel=author URL discovered on the post (may be empty).
    :return: The discovered h-card, or an empty dict.
    """
    parsed_page = mf2py.parse(url=author_url)

    # get rel me values from parsed object
    rels = parsed_page.get("rels") or {}
    rel_mes = rels.get("me") or []

    # BUG FIX: item "type" values are lists (e.g. ["h-card"]), so the original
    # comparison against the string "h-card" never matched anything, and the
    # nested card["items"] lookup raised KeyError on plain h-card items.
    h_cards = [item for item in parsed_page["items"] if item.get("type") == ["h-card"]]

    # properties values are lists, so membership tests are used below rather
    # than direct string comparison
    for card in h_cards:
        properties = card.get("properties", {})
        card_urls = properties.get("url") or []

        if rel_author in card_urls and properties.get("uid") == properties.get("url"):
            return card

    for card in h_cards:
        properties = card.get("properties", {})

        if any(u in rel_mes for u in properties.get("url") or []):
            return card

    for card in h_cards:
        if rel_author in (card.get("properties", {}).get("url") or []):
            return card

    return {}
def discover_author(url: str, html: str = "", parsed_mf2: mf2py.Parser = None) -> dict:
    """
    Discover the author of a post per the IndieWeb Authorship specification.

    :refs: https://indieweb.org/authorship-spec

    :param url: The URL of the post.
    :type url: str
    :param html: The optional page contents to use. Specifying this value
        prevents a HTTP request being made to the URL.
    :type html: str
    :param parsed_mf2: An optional pre-parsed mf2 document for the page.
    :return: A h-card of the post.
    :rtype: dict

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        post_author = indieweb_utils.discover_author(url)

        print(post_author) # A h-card object representing the post author.
    """
    full_page = get_parsed_mf2_data(parsed_mf2, html, url)

    preliminary_author = None

    h_entry = [e for e in full_page["items"] if e["type"] == ["h-entry"]]

    if h_entry and h_entry[0]["properties"].get("author"):
        preliminary_author = h_entry[0]["properties"]["author"][0]

    h_feed = [e for e in full_page["items"] if e["type"] == ["h-feed"]]

    if h_feed and h_feed[0]["properties"].get("author"):
        # BUG FIX: previously read the author from h_entry instead of h_feed
        preliminary_author = h_feed[0]["properties"]["author"][0]

    author_page_url = None
    # BUG FIX: initialized up front — previously unbound when author_page_url
    # came from preliminary_author rather than a rel=author link
    rel_author = ""

    if preliminary_author and isinstance(preliminary_author, str):
        if preliminary_author.startswith("https://"):
            # author is url, further processing needed
            author_page_url = preliminary_author
        else:
            # author is name
            return {
                "type": ["h-card"],
                "properties": {
                    "name": [preliminary_author],
                    "url": [url],
                },
            }

    if preliminary_author and isinstance(preliminary_author, dict):
        # author is h-card so the value can be returned
        return preliminary_author

    # if rel=author, look for h-card on the rel=author link
    # BUG FIX: rels live at the top level of the parsed mf2 document, not on
    # individual items, so the original h_entry[0].get("rels") was always None
    if author_page_url is None:
        rel_author_links = (full_page.get("rels") or {}).get("author") or []

        if rel_author_links:
            rel_author = rel_author_links[0]
            author_page_url = rel_author

    # canonicalize author page
    if author_page_url:
        domain = url_parse.urlsplit(url).netloc

        author_url = canonicalize_url(author_page_url, domain)

        return _discover_h_card_from_author_page(author_url, rel_author)

    return {}
def get_post_type(h_entry: dict = {}, custom_properties: List[Tuple[str, str]] = []) -> str:
    """
    Return the type of a h-entry per the Post Type Discovery algorithm.

    :param h_entry: The h-entry whose type to retrieve.
    :type h_entry: dict
    :param custom_properties: The optional custom properties to use for the
        Post Type Discovery algorithm.
    :type custom_properties: list[tuple[str, str]]
    :return: The type of the h-entry.
    :rtype: str

    Here is an example of the function in action:

    .. code-block:: python

        import indieweb_utils
        import mf2py

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        parsed_mf2 = mf2py.parse(url=url)

        h_entry = [e for e in parsed_mf2["items"] if e["type"] == ["h-entry"]][0]

        post_type = indieweb_utils.get_post_type(h_entry)

        print(post_type) # article

    :raises PostTypeFormattingError: Raised when you specify a
        custom_properties tuple in the wrong format.
    """
    # NOTE: the mutable defaults above are never mutated, so they are safe;
    # they are kept unchanged for backward compatibility.
    post = h_entry.get("properties")

    if post is None:
        return "unknown"

    # (property name, post type) pairs, checked in priority order
    values_to_check = [
        ("rsvp", "rsvp"),
        ("in-reply-to", "reply"),
        ("repost-of", "repost"),
        ("like-of", "like"),
        ("video", "video"),
        ("photo", "photo"),
        ("summary", "summary"),
    ]

    for prop in custom_properties:
        # BUG FIX: isinstance is checked before len(); the original called
        # len() first, raising TypeError on unsized values instead of the
        # documented PostTypeFormattingError
        if isinstance(prop, tuple) and len(prop) == 2 and isinstance(prop[0], str) and isinstance(prop[1], str):
            values_to_check.append(prop)
        else:
            raise PostTypeFormattingError("custom_properties must be a list of tuples")

    for prop_name, mapped_type in values_to_check:
        if post.get(prop_name):
            return mapped_type

    post_type = "note"

    name = post.get("name")

    # BUG FIX: also guard against an empty "name" list, which previously
    # raised IndexError
    if not name or name[0] == "":
        return post_type

    title = name[0].strip().replace("\n", " ").replace("\r", " ")

    # Default should be a list so we're never dealing with None
    content = post.get("content", [])

    if content:
        # Default should be an empty string, so we're never dealing with None
        text = content[0].get("text", "")
        html = content[0].get("html", "")

        if html or text:
            # Prefer to validate against html than text version of the content
            content_text = BeautifulSoup(html or text, "lxml").get_text()

            if content_text and not content_text.startswith(title):
                return "article"

    return post_type
def _syndication_check(url_to_check, posse_permalink, candidate_url, posse_domain): if url_to_check == posse_permalink: return candidate_url if url_to_check and url_parse.urlsplit(url_to_check).netloc == posse_domain: try: r = requests.get(url_to_check, timeout=10, allow_redirects=True) except requests.exceptions.RequestException: # handler will prevent exception due to timeout, if one occurs pass for url_item in r.history: if url_item.url == posse_permalink: return candidate_url return None