"""Source code for indieweb_utils.posts.discovery: original post, author, and post type discovery."""

import re
from typing import List, Tuple
from urllib import parse as url_parse

import mf2py
import requests
from bs4 import BeautifulSoup

from ..parsing.parse import get_parsed_mf2_data, get_soup
from ..utils.urls import _is_http_url, canonicalize_url

# This regex identifies permashortlink citations in the form of (example.com slug)
# Permashortlink citations may be used as a link to a post that does not contain a hyperlink
# Checking for a permashortlink citation is a step in the Original Post Discovery algorithm
# More on permashortlink citations: https://indieweb.org/permashortcitation
PERMASHORTLINK_CITATION_BRACKET_MATCHING: str = r"\((.*?)\)"


class PostDiscoveryError(Exception):
    # Raised when original post discovery fails, e.g. a candidate URL cannot be
    # retrieved or a page is not marked up with an h-entry.
    pass


class PostTypeFormattingError(Exception):
    # Raised when custom_properties passed to get_post_type are not a list of
    # (str, str) tuples.
    pass


def _process_candidate_url(candidate_url: str, posse_permalink: str, parsed_post: BeautifulSoup) -> str:
    """
    Check whether a candidate URL syndicates to the given POSSE permalink.

    Fetches the candidate URL and inspects its ``u-syndication`` links, then
    falls back to the source post's ``rel=syndication`` link elements.

    :param candidate_url: The URL whose syndication links should be inspected.
    :param posse_permalink: The POSSE copy we are trying to match.
    :param parsed_post: The parsed source post (used for rel=syndication links).
    :return: The original post URL, or an empty string if no match was found.
    :raises PostDiscoveryError: If the candidate URL cannot be retrieved.
    """
    try:
        request = requests.get(candidate_url, timeout=5)
    except requests.exceptions.RequestException:
        raise PostDiscoveryError("Could not get candidate url")

    parsed_candidate_url = BeautifulSoup(request.text, "lxml")

    posse_domain = url_parse.urlsplit(posse_permalink).netloc

    for link in parsed_candidate_url.select("a"):
        # BUG FIX: anchors without a class attribute return None from
        # link.get("class"), so the "in" test raised TypeError; guard with []
        if "u-syndication" in (link.get("class") or []):
            original_post_url = _syndication_check(
                link.get("href"), posse_permalink, candidate_url, posse_domain
            )

            if original_post_url:
                return original_post_url

    for header in parsed_post.select("link[rel='syndication']"):
        if header.get("href") == posse_permalink:
            original_post_url = _syndication_check(
                header.get("href"), posse_permalink, candidate_url, posse_domain
            )

            if original_post_url:
                return original_post_url

    return ""


def _check_for_link_in_post(last_text: BeautifulSoup) -> str:
    """
    Extract a candidate original-post URL from the last paragraph of a post's
    e-content.

    Looks first for a permashortlink citation of the form ``(example.com slug)``,
    then for a plain URL at the end of the paragraph.

    :param last_text: The ``.e-content`` elements selected from the post
        (a non-empty BeautifulSoup result set).
    :return: The candidate URL, or an empty string if none was found.
    """
    final_paragraph = last_text[0].select("p")[-1]

    # if permashortlink citation
    # format = (url.com id)
    permashortlink = re.search(PERMASHORTLINK_CITATION_BRACKET_MATCHING, final_paragraph.text)

    if permashortlink is not None:
        # BUG FIX: build http://<domain>/<slug> from the citation contents.
        # The original concatenated the full match (including the parentheses)
        # with the inner group, producing an invalid URL, and raised
        # UnboundLocalError when no citation was present at all.
        citation_parts = permashortlink.group(1).split(" ", 1)

        if len(citation_parts) == 2:
            domain, slug = citation_parts
            return "http://" + domain + "/" + slug

    # check for url at end
    split_text = final_paragraph.text.split(" ")

    if _is_http_url(split_text[-1]):
        return split_text[-1]

    return ""


def discover_original_post(posse_permalink: str, soup: BeautifulSoup = None, html: str = "") -> str:
    """
    Find the original version of a post per the Original Post Discovery algorithm.

    refs: https://indieweb.org/original-post-discovery#Algorithm

    :param posse_permalink: The permalink of the post.
    :type posse_permalink: str
    :param soup: An optional pre-parsed document for the post. Specifying this
        prevents the page being fetched/re-parsed.
    :type soup: BeautifulSoup
    :param html: Optional raw HTML for the post, used when soup is not given.
    :type html: str
    :return: The original post permalink.
    :rtype: str

    Example:

    .. code-block:: python

        import indieweb_utils

        original_post_url = indieweb_utils.discover_original_post("https://example.com")

        print(original_post_url)

    :raises PostDiscoveryError: A candidate URL cannot be retrieved or when a
        specified post is not marked up with h-entry.
    """
    parsed_post = get_soup(html, posse_permalink) if soup is None else soup

    # Get the post h-entry
    post_h_entry = parsed_post.select(".h-entry")

    if not post_h_entry:
        raise PostDiscoveryError("Could not find h-entry")

    post_h_entry = post_h_entry[0]

    # select with u-url and u-uid
    # NOTE(review): ".u-url .u-uid" selects a .u-uid *descendant* of .u-url; an
    # element carrying both classes would be ".u-url.u-uid" — confirm intent.
    url_and_uid = post_h_entry.select(".u-url .u-uid")
    if url_and_uid:
        return url_and_uid[0].get("href")

    canonical_links = parsed_post.select("link[rel='canonical']")
    if canonical_links:
        return canonical_links[0].get("href")

    # look for text with "see original" anchor text
    for link in parsed_post.select("a"):
        if link.text.lower() == "see original" and link.get("href"):
            return link.get("href")

    candidate_url = None
    last_text = post_h_entry.select(".e-content")

    if last_text:
        candidate_url = _check_for_link_in_post(last_text)

    if candidate_url:
        post_url = _process_candidate_url(candidate_url, posse_permalink, parsed_post)

        if post_url != "":
            return post_url

    return ""
def _discover_h_card_from_author_page(author_url: str, rel_author: str) -> dict:
    """
    Fetch an author page and return the best-matching h-card on it.

    Match priority (per the IndieWeb Authorship spec):
    1. an h-card whose url matches rel_author and whose uid equals its url;
    2. an h-card whose url appears among the page's rel=me links;
    3. an h-card whose url matches rel_author.

    :param author_url: The URL of the author page to parse.
    :param rel_author: The rel=author URL discovered on the post (may be empty).
    :return: The discovered h-card, or an empty dict.
    """
    parsed_page = mf2py.parse(url=author_url)

    # get rel me values from parsed object
    rels = parsed_page.get("rels") or {}
    rel_mes = rels.get("me") or []

    # BUG FIX: item "type" values are lists (e.g. ["h-card"]), so the original
    # comparison against the string "h-card" never matched anything, and the
    # nested card["items"] lookup raised KeyError on plain h-card items.
    h_cards = [item for item in parsed_page["items"] if item.get("type") == ["h-card"]]

    # properties values are lists, so membership tests are used below rather
    # than direct string comparison
    for card in h_cards:
        properties = card.get("properties", {})
        card_urls = properties.get("url") or []

        if rel_author in card_urls and properties.get("uid") == properties.get("url"):
            return card

    for card in h_cards:
        properties = card.get("properties", {})

        if any(u in rel_mes for u in properties.get("url") or []):
            return card

    for card in h_cards:
        if rel_author in (card.get("properties", {}).get("url") or []):
            return card

    return {}
def discover_author(url: str, html: str = "", parsed_mf2: mf2py.Parser = None) -> dict:
    """
    Discover the author of a post per the IndieWeb Authorship specification.

    :refs: https://indieweb.org/authorship-spec

    :param url: The URL of the post.
    :type url: str
    :param html: The optional page contents to use. Specifying this value
        prevents a HTTP request being made to the URL.
    :type html: str
    :param parsed_mf2: An optional pre-parsed mf2 document for the page.
    :return: A h-card of the post.
    :rtype: dict

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        post_author = indieweb_utils.discover_author(url)

        print(post_author) # A h-card object representing the post author.
    """
    full_page = get_parsed_mf2_data(parsed_mf2, html, url)

    preliminary_author = None

    h_entry = [e for e in full_page["items"] if e["type"] == ["h-entry"]]

    if h_entry and h_entry[0]["properties"].get("author"):
        preliminary_author = h_entry[0]["properties"]["author"][0]

    h_feed = [e for e in full_page["items"] if e["type"] == ["h-feed"]]

    if h_feed and h_feed[0]["properties"].get("author"):
        # BUG FIX: previously read the author from h_entry instead of h_feed
        preliminary_author = h_feed[0]["properties"]["author"][0]

    author_page_url = None
    # BUG FIX: initialized up front — previously unbound when author_page_url
    # came from preliminary_author rather than a rel=author link
    rel_author = ""

    if preliminary_author and isinstance(preliminary_author, str):
        if preliminary_author.startswith("https://"):
            # author is url, further processing needed
            author_page_url = preliminary_author
        else:
            # author is name
            return {
                "type": ["h-card"],
                "properties": {
                    "name": [preliminary_author],
                    "url": [url],
                },
            }

    if preliminary_author and isinstance(preliminary_author, dict):
        # author is h-card so the value can be returned
        return preliminary_author

    # if rel=author, look for h-card on the rel=author link
    # BUG FIX: rels live at the top level of the parsed mf2 document, not on
    # individual items, so the original h_entry[0].get("rels") was always None
    if author_page_url is None:
        rel_author_links = (full_page.get("rels") or {}).get("author") or []

        if rel_author_links:
            rel_author = rel_author_links[0]
            author_page_url = rel_author

    # canonicalize author page
    if author_page_url:
        domain = url_parse.urlsplit(url).netloc

        author_url = canonicalize_url(author_page_url, domain)

        return _discover_h_card_from_author_page(author_url, rel_author)

    return {}
def get_post_type(h_entry: dict = {}, custom_properties: List[Tuple[str, str]] = []) -> str:
    """
    Return the type of a h-entry per the Post Type Discovery algorithm.

    :param h_entry: The h-entry whose type to retrieve.
    :type h_entry: dict
    :param custom_properties: The optional custom properties to use for the
        Post Type Discovery algorithm.
    :type custom_properties: list[tuple[str, str]]
    :return: The type of the h-entry.
    :rtype: str

    Here is an example of the function in action:

    .. code-block:: python

        import indieweb_utils
        import mf2py

        url = "https://jamesg.blog/2022/01/28/integrated-indieweb-services/"

        parsed_mf2 = mf2py.parse(url=url)

        h_entry = [e for e in parsed_mf2["items"] if e["type"] == ["h-entry"]][0]

        post_type = indieweb_utils.get_post_type(h_entry)

        print(post_type) # article

    :raises PostTypeFormattingError: Raised when you specify a
        custom_properties tuple in the wrong format.
    """
    # NOTE: the mutable defaults above are never mutated, so they are safe;
    # they are kept unchanged for backward compatibility.
    post = h_entry.get("properties")

    if post is None:
        return "unknown"

    # (property name, post type) pairs, checked in priority order
    values_to_check = [
        ("rsvp", "rsvp"),
        ("in-reply-to", "reply"),
        ("repost-of", "repost"),
        ("like-of", "like"),
        ("video", "video"),
        ("photo", "photo"),
        ("summary", "summary"),
    ]

    for prop in custom_properties:
        # BUG FIX: isinstance is checked before len(); the original called
        # len() first, raising TypeError on unsized values instead of the
        # documented PostTypeFormattingError
        if isinstance(prop, tuple) and len(prop) == 2 and isinstance(prop[0], str) and isinstance(prop[1], str):
            values_to_check.append(prop)
        else:
            raise PostTypeFormattingError("custom_properties must be a list of tuples")

    for prop_name, mapped_type in values_to_check:
        if post.get(prop_name):
            return mapped_type

    post_type = "note"

    name = post.get("name")

    # BUG FIX: also guard against an empty "name" list, which previously
    # raised IndexError
    if not name or name[0] == "":
        return post_type

    title = name[0].strip().replace("\n", " ").replace("\r", " ")

    # Default should be a list so we're never dealing with None
    content = post.get("content", [])

    if content:
        # Default should be an empty string, so we're never dealing with None
        text = content[0].get("text", "")
        html = content[0].get("html", "")

        if html or text:
            # Prefer to validate against html than text version of the content
            content_text = BeautifulSoup(html or text, "lxml").get_text()

            if content_text and not content_text.startswith(title):
                return "article"

    return post_type
def _syndication_check(url_to_check, posse_permalink, candidate_url, posse_domain): if url_to_check == posse_permalink: return candidate_url if url_to_check and url_parse.urlsplit(url_to_check).netloc == posse_domain: try: r = requests.get(url_to_check, timeout=10, allow_redirects=True) except requests.exceptions.RequestException: # handler will prevent exception due to timeout, if one occurs pass for url_item in r.history: if url_item.url == posse_permalink: return candidate_url return None