diff --git a/backend/app/pypi_proxy.py b/backend/app/pypi_proxy.py index aba1613..942bfdb 100644 --- a/backend/app/pypi_proxy.py +++ b/backend/app/pypi_proxy.py @@ -81,7 +81,7 @@ def _get_basic_auth(source) -> Optional[tuple[str, str]]: return None -def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str: +def _rewrite_package_links(html: str, base_url: str, package_name: str, upstream_base_url: str) -> str: """ Rewrite download links in a PyPI simple page to go through our proxy. @@ -89,6 +89,7 @@ def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str: html: The HTML content from upstream base_url: Our server's base URL package_name: The package name for the URL path + upstream_base_url: The upstream URL used to fetch this page (for resolving relative URLs) Returns: HTML with rewritten download links @@ -96,19 +97,31 @@ def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str: # Pattern to match href attributes in anchor tags # PyPI simple pages have links like: # file.tar.gz + # Or relative URLs from Artifactory like: + # def replace_href(match): original_url = match.group(1) + + # Resolve relative URLs to absolute using the upstream base URL + if not original_url.startswith(('http://', 'https://')): + # Split off fragment before resolving + url_without_fragment = original_url.split('#')[0] + fragment_part = original_url[len(url_without_fragment):] + absolute_url = urljoin(upstream_base_url, url_without_fragment) + fragment_part + else: + absolute_url = original_url + # Extract the filename from the URL - parsed = urlparse(original_url) + parsed = urlparse(absolute_url) path_parts = parsed.path.split('/') filename = path_parts[-1] if path_parts else '' # Keep the hash fragment if present fragment = f"#{parsed.fragment}" if parsed.fragment else "" - # Encode the original URL for safe transmission - encoded_url = quote(original_url.split('#')[0], safe='') + # Encode the absolute URL (without fragment) for safe transmission + encoded_url = quote(absolute_url.split('#')[0], safe='') # Build new URL pointing to our proxy new_url = f"{base_url}/pypi/simple/{package_name}/{filename}?upstream={encoded_url}{fragment}" @@ -236,6 +249,7 @@ async def pypi_package_versions( auth = _get_basic_auth(source) package_url = source.url.rstrip('/') + f'/simple/{normalized_name}/' + final_url = package_url # Track final URL after redirects timeout = httpx.Timeout(PROXY_READ_TIMEOUT, connect=PROXY_CONNECT_TIMEOUT) @@ -255,7 +269,9 @@ async def pypi_package_versions( # Make redirect URL absolute if needed if not redirect_url.startswith('http'): - redirect_url = urljoin(package_url, redirect_url) + redirect_url = urljoin(final_url, redirect_url) + + final_url = redirect_url # Update final URL response = client.get( redirect_url, @@ -269,7 +285,8 @@ async def pypi_package_versions( content = response.text # Rewrite download links to go through our proxy - content = _rewrite_package_links(content, base_url, normalized_name) + # Pass final_url so relative URLs can be resolved correctly + content = _rewrite_package_links(content, base_url, normalized_name, final_url) return HTMLResponse(content=content)