Source code for indieweb_utils.feeds.discovery

import dataclasses
from typing import Dict, List, Optional, Tuple
from urllib import parse as url_parse

import mf2py
import requests
from bs4 import BeautifulSoup

from ..utils.urls import _is_http_url, canonicalize_url
from ..webmentions.discovery import _find_links_in_headers


@dataclasses.dataclass
class FeedUrl:
    url: str
    mime_type: str
    title: str
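
# Illustrative sketch (not part of the original module): constructing a FeedUrl by
# hand to show the shape of the records that discover_web_page_feeds returns.
# The URL, mime type, and title below are made-up placeholder values.
def _example_feed_url() -> FeedUrl:
    return FeedUrl(
        url="https://example.com/feed.xml",
        mime_type="application/rss+xml",
        title="Example Feed",
    )
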
def _get_page_feed_contents(url: str, html: str) -> Tuple[requests.Response, str]:
    # if HTML was supplied, only a HEAD request is needed to retrieve the response headers
    if html:
        try:
            web_page_request = requests.head(url, timeout=10, allow_redirects=True)
        except requests.RequestException:
            raise Exception("Request to retrieve URL did not return a valid response.")

    # otherwise, fetch the page and use the response body as the HTML to parse
    if not html:
        try:
            web_page_request = requests.get(url, timeout=10, allow_redirects=True)
        except requests.RequestException:
            raise Exception("Request to retrieve URL did not return a valid response.")
        else:
            html = web_page_request.text

    return web_page_request, html
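
# Illustrative sketch (not part of the original module): how the helper above is
# typically called. With HTML already in hand, only a HEAD request is issued so the
# response headers remain available for feed discovery; otherwise a GET request
# fetches both the headers and the page body. The URL is a placeholder.
def _example_fetch(url: str = "https://example.com") -> str:
    response, html = _get_page_feed_contents(url, html="")
    # response carries the HTTP headers; html carries the page body
    return html
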
def discover_web_page_feeds(
    url: str, user_mime_types: Optional[List[str]] = None, html: str = ""
) -> List[FeedUrl]:
    """
    Get all feeds on a web page.

    :param url: The URL of the page whose associated feeds you want to retrieve.
    :type url: str
    :param user_mime_types: A list of mime types whose associated feeds you want to retrieve.
    :type user_mime_types: Optional[List[str]]
    :param html: A string with the HTML on a page.
    :type html: str
    :return: A list of FeedUrl objects.
    :rtype: List[FeedUrl]

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/"

        feeds = indieweb_utils.discover_web_page_feeds(url)

        # print the url of all feeds to the console
        for f in feeds:
            print(f.url)
    """
    user_mime_types = user_mime_types or []

    if not _is_http_url(url):
        url = "https://" + url
    elif url.startswith("//"):
        url = "https:" + url

    web_page_request, html = _get_page_feed_contents(url, html)

    soup = BeautifulSoup(html, "lxml")

    # check for presence of mf2 hfeed
    h_feed = soup.find_all(class_="h-feed")

    page_title = soup.find("title")

    page_domain = url_parse.urlsplit(url).netloc

    valid_mime_types = {
        "application/rss+xml",
        "application/atom+xml",
        "application/rdf+xml",
        "application/xml",
        "application/json",
        "application/mf2+json",
        "application/feed+json",
        "application/jf2feed_json",
    }

    feeds: List[FeedUrl] = []

    # search <link rel="alternate"> tags for each accepted feed mime type
    for mime_type in valid_mime_types.union(user_mime_types):
        feed_link = soup.find("link", rel="alternate", type=mime_type)

        if feed_link:
            feed_title = feed_link.get("title")
            feed_url = canonicalize_url(feed_link.get("href"), page_domain)

            feeds.append(FeedUrl(url=feed_url, mime_type=mime_type, title=feed_title))

    # a page that contains an h-feed microformat is itself a feed
    if h_feed:
        feeds.append(FeedUrl(url=url, mime_type="text/html", title=page_title.text))

    # feeds may also be advertised in the Link HTTP headers
    http_headers = _find_links_in_headers(
        headers=web_page_request.headers, target_headers=["alternate", "feed"]
    )

    for rel, item in http_headers.items():
        feed_mime_type = item.get("mime_type", "")
        feed_title = http_headers.get(rel, "")
        feed_url = canonicalize_url(url, page_domain)

        feeds.append(FeedUrl(url=feed_url, mime_type=feed_mime_type, title=feed_title))

    return feeds
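
# Illustrative sketch (not part of the original module): filtering the results of
# discover_web_page_feeds down to a single mime type. The URL is a placeholder and
# "application/feed+json" is just one of the mime types the function accepts.
def _example_find_json_feeds(url: str = "https://example.com") -> List[FeedUrl]:
    feeds = discover_web_page_feeds(url)

    return [feed for feed in feeds if feed.mime_type == "application/feed+json"]
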
def discover_h_feed(url: str, html: str = "") -> Dict:
    """
    Find the main h-feed that represents a web page as per the h-feed Discovery algorithm.

    refs: https://microformats.org/wiki/h-feed#Discovery

    :param url: The URL of the page whose associated feeds you want to retrieve.
    :type url: str
    :param html: The HTML of a page whose feeds you want to retrieve
    :type html: str
    :return: The h-feed data.
    :rtype: dict

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/"

        hfeed = indieweb_utils.discover_h_feed(url)

        print(hfeed)
    """
    if html:
        parsed_main_page_mf2 = mf2py.parse(doc=html)
    else:
        parsed_main_page_mf2 = mf2py.parse(url=url)

    all_page_feeds = discover_web_page_feeds(url)

    get_mf2_feed = [feed for feed in all_page_feeds if feed.mime_type == "text/mf2+html"]

    if len(get_mf2_feed) > 0:
        feed = get_mf2_feed[0].url

        parsed_feed = mf2py.parse(url=feed)

        h_feed = [
            item
            for item in parsed_feed["items"]
            if item.get("type") and item.get("type")[0] == "h-feed"
        ]

        if h_feed:
            return h_feed[0]

    h_feed = [
        item
        for item in parsed_main_page_mf2["items"]
        if item.get("type") and item.get("type")[0] == "h-feed"
    ]

    if h_feed:
        return h_feed[0]

    return {}
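
# Illustrative sketch (not part of the original module): reading entries out of the
# microformats dictionary returned by discover_h_feed. This assumes the nested
# h-entry items appear under the "children" key of the parsed h-feed, as in mf2py
# output; an empty dict means no h-feed was found. The URL is a placeholder.
def _example_list_entry_urls(url: str = "https://example.com") -> List[str]:
    h_feed = discover_h_feed(url)

    urls: List[str] = []
    for child in h_feed.get("children", []):
        if child.get("type", [""])[0] == "h-entry":
            urls.extend(child.get("properties", {}).get("url", []))

    return urls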