Fix relative URL handling in PyPI proxy

Artifactory and other registries may return relative URLs in their
Simple API responses (e.g., ../../packages/...). The proxy now resolves
these to absolute URLs with urljoin(), using the final URL of the index
page (after following any redirects) as the base, before encoding them
in the upstream parameter.

This fixes package downloads failing when the upstream registry uses
relative URLs in its package index.
This commit is contained in:
Mondo Diaz
2026-01-29 18:01:19 -06:00
parent bdfed77cb1
commit 64bfd3902f

View File

@@ -81,7 +81,7 @@ def _get_basic_auth(source) -> Optional[tuple[str, str]]:
return None return None
def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str: def _rewrite_package_links(html: str, base_url: str, package_name: str, upstream_base_url: str) -> str:
""" """
Rewrite download links in a PyPI simple page to go through our proxy. Rewrite download links in a PyPI simple page to go through our proxy.
@@ -89,6 +89,7 @@ def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str:
html: The HTML content from upstream html: The HTML content from upstream
base_url: Our server's base URL base_url: Our server's base URL
package_name: The package name for the URL path package_name: The package name for the URL path
upstream_base_url: The upstream URL used to fetch this page (for resolving relative URLs)
Returns: Returns:
HTML with rewritten download links HTML with rewritten download links
@@ -96,19 +97,31 @@ def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str:
# Pattern to match href attributes in anchor tags # Pattern to match href attributes in anchor tags
# PyPI simple pages have links like: # PyPI simple pages have links like:
# <a href="https://files.pythonhosted.org/packages/.../file.tar.gz#sha256=...">file.tar.gz</a> # <a href="https://files.pythonhosted.org/packages/.../file.tar.gz#sha256=...">file.tar.gz</a>
# Or relative URLs from Artifactory like:
# <a href="../../packages/packages/62/35/.../requests-0.10.0.tar.gz#sha256=...">
def replace_href(match): def replace_href(match):
original_url = match.group(1) original_url = match.group(1)
# Resolve relative URLs to absolute using the upstream base URL
if not original_url.startswith(('http://', 'https://')):
# Split off fragment before resolving
url_without_fragment = original_url.split('#')[0]
fragment_part = original_url[len(url_without_fragment):]
absolute_url = urljoin(upstream_base_url, url_without_fragment) + fragment_part
else:
absolute_url = original_url
# Extract the filename from the URL # Extract the filename from the URL
parsed = urlparse(original_url) parsed = urlparse(absolute_url)
path_parts = parsed.path.split('/') path_parts = parsed.path.split('/')
filename = path_parts[-1] if path_parts else '' filename = path_parts[-1] if path_parts else ''
# Keep the hash fragment if present # Keep the hash fragment if present
fragment = f"#{parsed.fragment}" if parsed.fragment else "" fragment = f"#{parsed.fragment}" if parsed.fragment else ""
# Encode the original URL for safe transmission # Encode the absolute URL (without fragment) for safe transmission
encoded_url = quote(original_url.split('#')[0], safe='') encoded_url = quote(absolute_url.split('#')[0], safe='')
# Build new URL pointing to our proxy # Build new URL pointing to our proxy
new_url = f"{base_url}/pypi/simple/{package_name}/{filename}?upstream={encoded_url}{fragment}" new_url = f"{base_url}/pypi/simple/{package_name}/{filename}?upstream={encoded_url}{fragment}"
@@ -236,6 +249,7 @@ async def pypi_package_versions(
auth = _get_basic_auth(source) auth = _get_basic_auth(source)
package_url = source.url.rstrip('/') + f'/simple/{normalized_name}/' package_url = source.url.rstrip('/') + f'/simple/{normalized_name}/'
final_url = package_url # Track final URL after redirects
timeout = httpx.Timeout(PROXY_READ_TIMEOUT, connect=PROXY_CONNECT_TIMEOUT) timeout = httpx.Timeout(PROXY_READ_TIMEOUT, connect=PROXY_CONNECT_TIMEOUT)
@@ -255,7 +269,9 @@ async def pypi_package_versions(
# Make redirect URL absolute if needed # Make redirect URL absolute if needed
if not redirect_url.startswith('http'): if not redirect_url.startswith('http'):
redirect_url = urljoin(package_url, redirect_url) redirect_url = urljoin(final_url, redirect_url)
final_url = redirect_url # Update final URL
response = client.get( response = client.get(
redirect_url, redirect_url,
@@ -269,7 +285,8 @@ async def pypi_package_versions(
content = response.text content = response.text
# Rewrite download links to go through our proxy # Rewrite download links to go through our proxy
content = _rewrite_package_links(content, base_url, normalized_name) # Pass final_url so relative URLs can be resolved correctly
content = _rewrite_package_links(content, base_url, normalized_name, final_url)
return HTMLResponse(content=content) return HTMLResponse(content=content)