# Source code for indieweb_utils.utils.urls

from typing import Optional
from urllib import parse as url_parse


def canonicalize_url(url: str, domain: str = "", full_url: str = "", protocol: str = "https") -> str:
    """
    Return a canonical URL for the given URL.

    Absolute ``http``/``https`` URLs are rebuilt as ``scheme://host/rest`` with
    any port removed. URLs with any other scheme (``mailto:``, ``irc://`` ...)
    are returned untouched. Protocol-relative URLs (``//host/path``) are given
    ``protocol`` as their scheme. Everything else is treated as a path on
    ``domain``; ``.`` and ``..`` segments are resolved and the query string and
    fragment are carried over unchanged.

    :param url: The URL to canonicalize.
    :type url: str
    :param domain: The domain to use for the canonical URL. A port, scheme or
        path included in this value is ignored; only the host name is used.
    :type domain: str
    :param full_url: Optional URL of the page the link was found on. Its path
        is used as the base directory for links that start with ``./``, and
        its host is used when ``domain`` is empty.
    :type full_url: str
    :param protocol: Optional protocol to use for the canonical URL.
    :type protocol: str
    :return: The canonical URL.
    :rtype: str

    .. code-block:: python

        import indieweb_utils

        url = "/contact"
        domain = "jamesg.blog"
        protocol = "https"

        canonical_url = indieweb_utils.canonicalize_url(
            url, domain, protocol=protocol
        )

        print(canonical_url) # https://jamesg.blog/contact
    """
    # Parse once; the scheme decides which of the branches below applies.
    scheme = url_parse.urlsplit(url).scheme

    if scheme in ("http", "https"):
        return _rebuild_absolute_url(scheme, url)

    # this will preserve links like irc:// and mailto:
    if scheme:
        return url

    # "//host/path" names its own host; it only lacks a scheme.
    if url.startswith("//"):
        return _rebuild_absolute_url(protocol, protocol + ":" + url)

    host = _host_only(domain) or _host_only(full_url)

    # Keep the query string / fragment away from the dot-segment handling.
    reference_path, suffix = _split_path_and_suffix(url)

    if reference_path.startswith("./"):
        # Relative to the directory named by full_url.
        base_path = _strip_host_prefix(url_parse.urlsplit(full_url).path, host)
        reference_path = base_path.rstrip("/") + reference_path[1:]
    elif reference_path.startswith("../"):
        # A leading "../" is resolved against the site root.
        reference_path = reference_path[3:]
    else:
        # Accept "example.com/page" style input when it names the same host.
        reference_path = _strip_host_prefix(reference_path, host)

    return protocol + "://" + host + "/" + _remove_dot_segments(reference_path) + suffix


def _rebuild_absolute_url(scheme: str, url: str) -> str:
    """Rebuild an absolute ``scheme://host[:port]/rest`` URL without its port."""
    host = url_parse.urlsplit(url).netloc.split(":")[0]

    # Items 0-2 of the split are "scheme:", "" and the network location.
    remainder = "/".join(url.split("/")[3:])

    return scheme + "://" + host + "/" + remainder


def _host_only(location: str) -> str:
    """Reduce a domain or URL to its bare host name (no scheme, port or path)."""
    location = location.strip()

    netloc = url_parse.urlsplit(location).netloc

    if netloc:
        location = netloc

    return location.split("/")[0].split(":")[0]


def _strip_host_prefix(path: str, host: str) -> str:
    """Drop ``host`` from the start of ``path`` when the path begins with it."""
    if host and (path == host or path.startswith(host + "/")):
        return path[len(host):]

    return path


def _split_path_and_suffix(reference: str) -> tuple:
    """Split a relative reference into its path and its ``?query#fragment`` tail."""
    for index, char in enumerate(reference):
        if char in "?#":
            return reference[:index], reference[index:]

    return reference, ""


def _remove_dot_segments(path: str) -> str:
    """Resolve ``.`` and ``..`` segments and return the path without leading slashes."""
    segments = path.lstrip("/").split("/")
    resolved = []

    for segment in segments:
        if segment == "..":
            # Step up one directory; extra ".." at the root are ignored.
            if resolved:
                resolved.pop()
        elif segment != ".":
            resolved.append(segment)

    # A path ending in "." or ".." refers to a directory: keep the trailing slash.
    if segments[-1] in (".", "..") and resolved and resolved[-1] != "":
        resolved.append("")

    return "/".join(resolved).lstrip("/")


def _is_http_url(url: str) -> bool:
    """
    Determine if URL is http or not
    """
    return url_parse.urlsplit(url).scheme in ["http", "https"]


def remove_tracking_params(url: str, custom_params: Optional[list] = None) -> str:
    """
    Remove all UTM tracking parameters from a URL.

    Query parameters whose name starts with ``utm_``, or with any of the
    prefixes in ``custom_params``, are dropped. Names are matched without
    regard to case. The rest of the URL keeps its original case, and the
    remaining parameters keep their order, repeated keys and blank values.

    :param url: The URL to remove tracking parameters from.
    :type url: str
    :param custom_params: An optional list of additional parameter name
        prefixes to remove. Empty strings are ignored.
    :type custom_params: list or None
    :return: The URL without tracking parameters.
    :rtype: str

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/indieweb/?utm_source=twitter&utm_medium=social&utm_campaign=webmention"

        url_without_tracking = indieweb_utils.remove_tracking_params(url)

        print(url_without_tracking) # https://jamesg.blog/indieweb/
    """
    # str.startswith accepts a tuple of prefixes, not a list. Empty prefixes
    # are skipped because they would match every parameter.
    blocked_prefixes = ("utm_",) + tuple(str(prefix).lower() for prefix in (custom_params or []) if prefix)

    parsed_url = url_parse.urlparse(url)

    # parse_qsl keeps parameter order and duplicates; blank values are kept so
    # "?a=&b=1" does not silently lose "a".
    query_pairs = url_parse.parse_qsl(parsed_url.query, keep_blank_values=True)

    # Build a new list rather than deleting from a collection while iterating it.
    kept_pairs = [(key, value) for key, value in query_pairs if not key.lower().startswith(blocked_prefixes)]

    new_query = url_parse.urlencode(kept_pairs)

    return parsed_url._replace(query=new_query).geturl()


def is_site_url(url: str, domain: str) -> bool:
    """
    Determine if a URL is a site URL.

    The URL's network location (host plus any port) is compared with
    ``domain``. Host names are not case-sensitive, so the comparison ignores
    case.

    :param url: The URL to check.
    :type url: str
    :param domain: The domain to check against.
    :type domain: str
    :return: Whether or not the URL is a site URL.
    :rtype: bool

    :raises ValueError: If the URL does not include a scheme.

    Example:

    .. code-block:: python

        import indieweb_utils

        url = "https://jamesg.blog/indieweb/"
        domain = "jamesg.blog"

        is_site_url = indieweb_utils.is_site_url(url, domain)

        print(is_site_url) # True
    """
    parsed_url = url_parse.urlparse(url)

    if not parsed_url.scheme:
        raise ValueError("URL must include a scheme")

    # URLs such as mailto: have a scheme but no host, so they can never match.
    if not parsed_url.netloc:
        return False

    return parsed_url.netloc.lower() == domain.lower()


def slugify(url: str, remove_extension: bool = False, allowed_chars: Optional[list] = None) -> str:
    """
    Turn a URL into a slug. Only alphanumeric characters, periods, dashes, slashes, and underscores are allowed
    in the resulting slug, unless an allowed_chars list is provided.

    For ``http``/``https`` URLs only the path is slugified and the rest of the
    URL is kept. Any other input is treated as a path and returned with a
    single leading slash.

    :param url: The URL to slugify.
    :type url: str
    :param remove_extension: Whether or not to remove the file extension from the slug.
        The extension of the last path segment is replaced with a trailing slash.
    :type remove_extension: bool
    :param allowed_chars: A list of allowed characters. Defaults to
        ``["-", "/", "_", "."]`` when omitted.
    :type allowed_chars: list or None
    :return: A slugified URL.
    :rtype: str

    Example:

    .. code-block:: python

        import indieweb_utils

        indieweb_utils.slugify("https://jamesg.blog/indieweb.html", True) # https://jamesg.blog/indieweb/
        indieweb_utils.slugify("indieweb.html", True) # /indieweb/
    """
    # A None default avoids sharing one mutable list between calls.
    permitted = ["-", "/", "_", "."] if allowed_chars is None else allowed_chars

    # Decide once whether this is an http(s) URL so both uses below agree.
    parsed_url = url_parse.urlparse(url)
    is_http = parsed_url.scheme in ("http", "https")

    if is_http:
        full_url = url_parse.unquote(parsed_url.path)
    else:
        full_url = url_parse.unquote(url)

    # drop a space that sits directly before a slash so it does not become "-/"
    full_url = full_url.replace(" /", "/")

    if remove_extension:
        # Only the last path segment can carry a file extension; dots in
        # earlier directories (e.g. "/v1.2/page") must be left alone.
        directory, separator, filename = full_url.rpartition("/")
        stem, dot, extension = filename.rpartition(".")

        if dot and extension:
            full_url = directory + separator + stem + "/"

    path = "".join(char for char in full_url.replace(" ", "-") if char.isalnum() or char in permitted)

    if is_http:
        # recompose url and replace
        return parsed_url._replace(path=path).geturl()

    return "/" + path.lstrip("/")