Source code for indieweb_utils.feeds.discovery

import dataclasses
from typing import Dict, List, Optional, Tuple
from urllib import parse as url_parse

import mf2py
import requests
from bs4 import BeautifulSoup

from ..utils.urls import _is_http_url, canonicalize_url
from ..webmentions.discovery import _find_links_in_headers


@dataclasses.dataclass
class FeedUrl:
    url: str
    mime_type: str
    title: str
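
# Illustrative sketch (not part of the original module): constructing a FeedUrl by
# hand to show the shape of the records that discover_web_page_feeds returns.
# The URL, mime type, and title below are made-up placeholder values.
def _example_feed_url() -> FeedUrl:
    return FeedUrl(
        url="https://example.com/feed.xml",
        mime_type="application/rss+xml",
        title="Example Feed",
    )
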
def _get_page_feed_contents(url: str, html: str) -> Tuple[requests.Response, str]:
    # if HTML was supplied, only a HEAD request is needed to retrieve the response headers
    if html:
        try:
            web_page_request = requests.head(url, timeout=10, allow_redirects=True)
        except requests.RequestException:
            raise Exception("Request to retrieve URL did not return a valid response.")

    # otherwise, fetch the page and use the response body as the HTML to parse
    if not html:
        try:
            web_page_request = requests.get(url, timeout=10, allow_redirects=True)
        except requests.RequestException:
            raise Exception("Request to retrieve URL did not return a valid response.")
        else:
            html = web_page_request.text

    return web_page_request, html
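
# Illustrative sketch (not part of the original module): how the helper above is
# typically called. With HTML already in hand, only a HEAD request is issued so the
# response headers remain available for feed discovery; otherwise a GET request
# fetches both the headers and the page body. The URL is a placeholder.
def _example_fetch(url: str = "https://example.com") -> str:
    response, html = _get_page_feed_contents(url, html="")
    # response carries the HTTP headers; html carries the page body
    return html
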
def discover_web_page_feeds(
    url: str, user_mime_types: Optional[List[str]] = None, html: str = ""
) -> List[FeedUrl]:
    """
    Get all feeds on a web page.

    :param url: The URL of the page whose associated feeds you want to retrieve.
    :type url: str
    :param user_mime_types: A list of mime types whose associated feeds you want to retrieve.
    :type user_mime_types: Optional[List[str]]
    :param html: A string with the HTML on a page.
    :type html: str
    :return: A list of FeedUrl objects.
    :rtype: List[FeedUrl]

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/"

        feeds = indieweb_utils.discover_web_page_feeds(url)

        # print the url of all feeds to the console
        for f in feeds:
            print(f.url)
    """
    user_mime_types = user_mime_types or []

    if not _is_http_url(url):
        url = "https://" + url
    elif url.startswith("//"):
        url = "https:" + url

    web_page_request, html = _get_page_feed_contents(url, html)

    soup = BeautifulSoup(html, "lxml")

    # check for presence of mf2 hfeed
    h_feed = soup.find_all(class_="h-feed")

    page_title = soup.find("title")

    page_domain = url_parse.urlsplit(url).netloc

    valid_mime_types = {
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "application/json",
        "application/mf2+json",
        "application/feed+json",
        "application/jf2feed_json",
    }

    feeds: List[FeedUrl] = []

    # search <link rel="alternate"> tags for each accepted feed mime type
    for mime_type in valid_mime_types.union(user_mime_types):
        feed_link = soup.find("link", rel="alternate", type=mime_type)

        if feed_link:
            feed_title = feed_link.get("title")
            feed_url = canonicalize_url(feed_link.get("href"), page_domain)

            feeds.append(FeedUrl(url=feed_url, mime_type=mime_type, title=feed_title))

    # a page that contains an h-feed microformat is itself a feed
    if h_feed:
        feeds.append(FeedUrl(url=url, mime_type="text/html", title=page_title.text))

    # feeds may also be advertised in the Link HTTP headers
    http_headers = _find_links_in_headers(
        headers=web_page_request.headers, target_headers=["alternate", "feed"]
    )

    for rel, item in http_headers.items():
        feed_mime_type = item.get("mime_type", "")
        feed_title = http_headers.get(rel, "")
        feed_url = canonicalize_url(url, page_domain)

        feeds.append(FeedUrl(url=feed_url, mime_type=feed_mime_type, title=feed_title))

    return feeds
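
# Illustrative sketch (not part of the original module): filtering the results of
# discover_web_page_feeds down to a single mime type. The URL is a placeholder and
# "application/feed+json" is just one of the mime types the function accepts.
def _example_find_json_feeds(url: str = "https://example.com") -> List[FeedUrl]:
    feeds = discover_web_page_feeds(url)

    return [feed for feed in feeds if feed.mime_type == "application/feed+json"]
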
def discover_h_feed(url: str, html: str = "") -> Dict:
    """
    Find the main h-feed that represents a web page as per the h-feed Discovery algorithm.

    refs: https://microformats.org/wiki/h-feed#Discovery

    :param url: The URL of the page whose associated feeds you want to retrieve.
    :type url: str
    :param html: The HTML of a page whose feeds you want to retrieve
    :type html: str
    :return: The h-feed data.
    :rtype: dict

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/"

        hfeed = indieweb_utils.discover_h_feed(url)

        print(hfeed)
    """
    if html:
        parsed_main_page_mf2 = mf2py.parse(doc=html)
    else:
        parsed_main_page_mf2 = mf2py.parse(url=url)

    all_page_feeds = discover_web_page_feeds(url)

    get_mf2_feed = [feed for feed in all_page_feeds if feed.mime_type == "text/mf2+html"]

    if len(get_mf2_feed) > 0:
        feed = get_mf2_feed[0].url

        parsed_feed = mf2py.parse(url=feed)

        h_feed = [
            item
            for item in parsed_feed["items"]
            if item.get("type") and item.get("type")[0] == "h-feed"
        ]

        if h_feed:
            return h_feed[0]

    h_feed = [
        item
        for item in parsed_main_page_mf2["items"]
        if item.get("type") and item.get("type")[0] == "h-feed"
    ]

    if h_feed:
        return h_feed[0]

    return {}
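
# Illustrative sketch (not part of the original module): reading entries out of the
# microformats dictionary returned by discover_h_feed. This assumes the nested
# h-entry items appear under the "children" key of the parsed h-feed, as in mf2py
# output; an empty dict means no h-feed was found. The URL is a placeholder.
def _example_list_entry_urls(url: str = "https://example.com") -> List[str]:
    h_feed = discover_h_feed(url)

    urls: List[str] = []
    for child in h_feed.get("children", []):
        if child.get("type", [""])[0] == "h-entry":
            urls.extend(child.get("properties", {}).get("url", []))

    return urls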