Merge branch 'feature/transparent-proxy' into 'main'

Add transparent PyPI proxy and improve upstream sources UI. Closes #108. See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!56.
Add transparent PyPI proxy and improve upstream sources UI
2026-01-29 16:12:57 -06:00 · 2026-01-29 16:12:57 -06:00 · 2026-01-29 14:25:19 -06:00 · 2026-01-29 14:25:19 -06:00
9 changed files with 912 additions and 17 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ## [Unreleased]
+### Added
+- Added transparent PyPI proxy implementing PEP 503 Simple API (#108)
+  - `GET /pypi/simple/` - package index (proxied from upstream)
+  - `GET /pypi/simple/{package}/` - version list with rewritten download links
+  - `GET /pypi/simple/{package}/{filename}` - download with automatic caching
+  - Allows `pip install --index-url https://orchard.../pypi/simple/ <package>`
+  - Artifacts cached on first access through configured upstream sources
+- Added `POST /api/v1/cache/resolve` endpoint to cache packages by coordinates instead of URL (#108)
+
+### Changed
+- Upstream sources table text is now centered under column headers (#108)
+- ENV badge now appears inline with source name instead of separate column (#108)
+- Test and Edit buttons now have more prominent button styling (#108)
+- Reduced footer padding for cleaner layout (#108)
+
 ### Fixed
 - Fixed purge_seed_data crash when deleting access permissions - was comparing UUID to VARCHAR column (#107)

--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -11,6 +11,7 @@ from slowapi.errors import RateLimitExceeded
 from .config import get_settings
 from .database import init_db, SessionLocal
 from .routes import router
+from .pypi_proxy import router as pypi_router
 from .seed import seed_database
 from .auth import create_default_admin
 from .rate_limit import limiter
@@ -65,6 +66,7 @@ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

 # Include API routes
 app.include_router(router)
+app.include_router(pypi_router)

 # Serve static files (React build) if the directory exists
 static_dir = os.path.join(os.path.dirname(__file__), "..", "..", "frontend", "dist")
--- a/backend/app/pypi_proxy.py
+++ b/backend/app/pypi_proxy.py
@@ -0,0 +1,543 @@
+"""
+Transparent PyPI proxy implementing PEP 503 (Simple API).
+
+Provides endpoints that allow pip to use Orchard as a PyPI index URL.
+Artifacts are cached on first access through configured upstream sources.
+"""
+
+import hashlib
+import logging
+import re
+from typing import Optional
+from urllib.parse import urljoin, urlparse, quote, unquote
+
+import httpx
+from fastapi import APIRouter, Depends, HTTPException, Request, Response
+from fastapi.responses import StreamingResponse, HTMLResponse
+from sqlalchemy.orm import Session
+
+from .database import get_db
+from .models import UpstreamSource, CachedUrl, Artifact, Project, Package, Tag
+from .storage import S3Storage, get_storage
+from .upstream import (
+    UpstreamClient,
+    UpstreamClientConfig,
+    UpstreamHTTPError,
+    UpstreamConnectionError,
+    UpstreamTimeoutError,
+)
+from .config import get_env_upstream_sources
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/pypi", tags=["pypi-proxy"])
+
+# Timeout configuration for proxy requests
+PROXY_CONNECT_TIMEOUT = 30.0
+PROXY_READ_TIMEOUT = 60.0
+
+
+def _get_pypi_upstream_sources(db: Session) -> list[UpstreamSource]:
+    """Return every enabled PyPI upstream source, ordered by ``priority``.
+
+    Sources stored in the database and sources declared through the
+    environment are merged into one list.
+    """
+    pypi_query = db.query(UpstreamSource).filter(
+        UpstreamSource.source_type == "pypi",
+        UpstreamSource.enabled == True,
+    )
+    combined = list(pypi_query.order_by(UpstreamSource.priority).all())
+
+    # Environment-defined sources are filtered here in Python, not in SQL.
+    for env_source in get_env_upstream_sources():
+        if env_source.source_type == "pypi" and env_source.enabled:
+            combined.append(env_source)
+
+    # Stable sort: database sources stay ahead of env sources on equal priority.
+    combined.sort(key=lambda src: src.priority)
+    return combined
+
+
+def _build_auth_headers(source) -> dict:
+    """Return the auth-related request headers for an upstream source.
+
+    ``bearer`` sources yield an ``Authorization: Bearer ...`` header when a
+    password is available; ``api_key`` sources contribute whatever
+    ``get_headers()`` returns. Any other source produces an empty dict.
+    """
+    auth_headers = {}
+    if not hasattr(source, 'auth_type'):
+        return auth_headers
+
+    kind = source.auth_type
+    if kind == "bearer":
+        if hasattr(source, 'get_password'):
+            token = source.get_password()
+        else:
+            token = getattr(source, 'password', None)
+        if token:
+            auth_headers["Authorization"] = f"Bearer {token}"
+    elif kind == "api_key":
+        extra = source.get_headers() if hasattr(source, 'get_headers') else {}
+        if extra:
+            auth_headers.update(extra)
+
+    return auth_headers
+
+
+def _get_basic_auth(source) -> Optional[tuple[str, str]]:
+    """Return ``(username, password)`` for a basic-auth source, else ``None``."""
+    if not hasattr(source, 'auth_type') or source.auth_type != "basic":
+        return None
+
+    user = getattr(source, 'username', None)
+    if not user:
+        return None
+
+    if hasattr(source, 'get_password'):
+        secret = source.get_password()
+    else:
+        secret = getattr(source, 'password', '')
+    # A missing password is sent as an empty string.
+    return (user, secret or '')
+
+
+def _rewrite_package_links(html: str, base_url: str, package_name: str) -> str:
+    """
+    Rewrite download links in a PyPI simple page to go through our proxy.
+
+    Every double-quoted ``href`` whose path ends in a filename becomes
+    ``{base_url}/pypi/simple/{package_name}/{filename}?upstream=<encoded url>``.
+    A hash fragment such as ``#sha256=...`` is kept at the end of the new URL.
+    Links without a filename (for example ``href="../"``) are left untouched.
+
+    Args:
+        html: The HTML content from upstream
+        base_url: Our server's base URL
+        package_name: The package name for the URL path
+
+    Returns:
+        HTML with rewritten download links
+    """
+    # Local import: the ``html`` parameter shadows the module name in here.
+    from html import escape, unescape
+
+    # PyPI simple pages have links like:
+    # <a href="https://files.pythonhosted.org/packages/.../file.tar.gz#sha256=...">file.tar.gz</a>
+
+    def replace_href(match):
+        # Attribute values are HTML-escaped (e.g. "&amp;" inside a signed
+        # URL), so decode before treating the value as a URL.
+        original_url = unescape(match.group(1))
+        parsed = urlparse(original_url)
+        filename = parsed.path.split('/')[-1]
+
+        if not filename:
+            # Not a file link; there is nothing to download through the proxy.
+            return match.group(0)
+
+        # Keep the hash fragment if present
+        fragment = f"#{parsed.fragment}" if parsed.fragment else ""
+
+        # Encode the original URL (without fragment) for safe transmission
+        encoded_url = quote(original_url.split('#')[0], safe='')
+
+        # Build new URL pointing to our proxy
+        new_url = f"{base_url}/pypi/simple/{package_name}/{filename}?upstream={encoded_url}{fragment}"
+
+        # Re-escape so the value is a valid HTML attribute again
+        return f'href="{escape(new_url, quote=True)}"'
+
+    # Match href="..." patterns
+    return re.sub(r'href="([^"]+)"', replace_href, html)
+
+
+@router.get("/simple/")
+async def pypi_simple_index(
+    request: Request,
+    db: Session = Depends(get_db),
+):
+    """
+    PyPI Simple API index - lists all packages.
+
+    Proxies to the first available upstream PyPI source and rewrites the
+    package links so they point back at this proxy.
+    """
+    sources = _get_pypi_upstream_sources(db)
+
+    if not sources:
+        raise HTTPException(
+            status_code=503,
+            detail="No PyPI upstream sources configured"
+        )
+
+    base_url = str(request.base_url).rstrip('/')
+
+    def _proxy_href(match):
+        # Upstream hrefs may be relative ("requests/"), root-relative
+        # ("/simple/requests/") or absolute. Only the last path segment is
+        # the project name, so use that instead of the whole href.
+        project = urlparse(match.group(1)).path.rstrip('/').rsplit('/', 1)[-1]
+        if not project or project in ('.', '..'):
+            return match.group(0)
+        return f'href="{base_url}/pypi/simple/{project}/"'
+
+    # NOTE(review): httpx.Client is synchronous, so this coroutine blocks
+    # while waiting on the upstream response.
+
+    # Try each source in priority order
+    last_error = None
+    for source in sources:
+        try:
+            headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
+            headers.update(_build_auth_headers(source))
+            auth = _get_basic_auth(source)
+
+            simple_url = source.url.rstrip('/') + '/simple/'
+
+            # httpx.Timeout needs a default (first argument) unless all four
+            # phases are given; the read timeout doubles as that default.
+            timeout = httpx.Timeout(
+                PROXY_READ_TIMEOUT,
+                connect=PROXY_CONNECT_TIMEOUT,
+            )
+
+            with httpx.Client(timeout=timeout, follow_redirects=False) as client:
+                response = client.get(
+                    simple_url,
+                    headers=headers,
+                    auth=auth,
+                )
+
+                # Handle redirects manually to avoid loops
+                if response.status_code in (301, 302, 303, 307, 308):
+                    redirect_url = response.headers.get('location')
+                    if redirect_url:
+                        # Location may be relative; resolve it first, then
+                        # follow the redirect once.
+                        redirect_url = urljoin(simple_url, redirect_url)
+                        response = client.get(
+                            redirect_url,
+                            headers=headers,
+                            auth=auth,
+                            follow_redirects=False,
+                        )
+
+                if response.status_code == 200:
+                    # Rewrite package links to go through our proxy
+                    content = re.sub(
+                        r'href="([^"]+)/"',
+                        _proxy_href,
+                        response.text
+                    )
+
+                    return HTMLResponse(content=content)
+
+                last_error = f"HTTP {response.status_code}"
+
+        except httpx.ConnectError as e:
+            last_error = f"Connection failed: {e}"
+            logger.warning(f"PyPI proxy: failed to connect to {source.url}: {e}")
+        except httpx.TimeoutException as e:
+            last_error = f"Timeout: {e}"
+            logger.warning(f"PyPI proxy: timeout connecting to {source.url}: {e}")
+        except Exception as e:
+            # Best effort: any other failure just moves on to the next source.
+            last_error = str(e)
+            logger.warning(f"PyPI proxy: error fetching from {source.url}: {e}")
+
+    raise HTTPException(
+        status_code=502,
+        detail=f"Failed to fetch package index from upstream: {last_error}"
+    )
+
+
+@router.get("/simple/{package_name}/")
+async def pypi_package_versions(
+    request: Request,
+    package_name: str,
+    db: Session = Depends(get_db),
+):
+    """
+    PyPI Simple API package page - lists all versions/files for a package.
+
+    Proxies to upstream and rewrites download links to go through our cache.
+    """
+    sources = _get_pypi_upstream_sources(db)
+
+    if not sources:
+        raise HTTPException(
+            status_code=503,
+            detail="No PyPI upstream sources configured"
+        )
+
+    base_url = str(request.base_url).rstrip('/')
+
+    # Normalize package name (PEP 503)
+    normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+    # Try each source in priority order
+    last_error = None
+    for source in sources:
+        try:
+            headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
+            headers.update(_build_auth_headers(source))
+            auth = _get_basic_auth(source)
+
+            package_url = source.url.rstrip('/') + f'/simple/{normalized_name}/'
+
+            # httpx.Timeout needs a default (first argument) unless all four
+            # phases are given; the read timeout doubles as that default.
+            timeout = httpx.Timeout(
+                PROXY_READ_TIMEOUT,
+                connect=PROXY_CONNECT_TIMEOUT,
+            )
+
+            with httpx.Client(timeout=timeout, follow_redirects=False) as client:
+                response = client.get(
+                    package_url,
+                    headers=headers,
+                    auth=auth,
+                )
+
+                # Handle redirects manually (at most 5 hops)
+                redirect_count = 0
+                current_url = package_url
+                while response.status_code in (301, 302, 303, 307, 308) and redirect_count < 5:
+                    location = response.headers.get('location')
+                    if not location:
+                        break
+
+                    # Resolve relative targets against the URL that issued
+                    # the redirect (absolute targets pass through unchanged).
+                    redirect_url = urljoin(current_url, location)
+
+                    # Don't send auth to different hosts
+                    redirect_headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
+                    redirect_auth = None
+                    if urlparse(redirect_url).netloc == urlparse(package_url).netloc:
+                        redirect_headers = headers
+                        redirect_auth = auth
+
+                    response = client.get(
+                        redirect_url,
+                        headers=redirect_headers,
+                        auth=redirect_auth,
+                        follow_redirects=False,
+                    )
+                    current_url = redirect_url
+                    redirect_count += 1
+
+                if response.status_code == 200:
+                    content = response.text
+
+                    # Rewrite download links to go through our proxy
+                    content = _rewrite_package_links(content, base_url, normalized_name)
+
+                    return HTMLResponse(content=content)
+
+                if response.status_code == 404:
+                    # Package not found in this source, try next
+                    last_error = f"Package not found in {source.name}"
+                    continue
+
+                last_error = f"HTTP {response.status_code}"
+
+        except httpx.ConnectError as e:
+            last_error = f"Connection failed: {e}"
+            logger.warning(f"PyPI proxy: failed to connect to {source.url}: {e}")
+        except httpx.TimeoutException as e:
+            last_error = f"Timeout: {e}"
+            logger.warning(f"PyPI proxy: timeout connecting to {source.url}: {e}")
+        except Exception as e:
+            # Best effort: any other failure just moves on to the next source.
+            last_error = str(e)
+            logger.warning(f"PyPI proxy: error fetching {package_name} from {source.url}: {e}")
+
+    raise HTTPException(
+        status_code=404,
+        detail=f"Package '{package_name}' not found: {last_error}"
+    )
+
+
+@router.get("/simple/{package_name}/{filename}")
+async def pypi_download_file(
+    request: Request,
+    package_name: str,
+    filename: str,
+    upstream: Optional[str] = None,
+    db: Session = Depends(get_db),
+    storage: S3Storage = Depends(get_storage),
+):
+    """
+    Download a package file, caching it in Orchard.
+
+    Args:
+        package_name: The package name
+        filename: The filename to download
+        upstream: URL-encoded upstream URL to fetch from (absolute http/https)
+    """
+    if not upstream:
+        raise HTTPException(
+            status_code=400,
+            detail="Missing 'upstream' query parameter with source URL"
+        )
+
+    # Decode the upstream URL
+    # NOTE(review): the framework has presumably already decoded the query
+    # string once, so this may double-decode URLs containing a literal "%" -
+    # confirm before changing, since the cache key is derived from this value.
+    upstream_url = unquote(upstream)
+
+    # The URL comes straight from the caller: reject anything that is not an
+    # absolute http(s) URL before doing any work with it.
+    parsed_upstream = urlparse(upstream_url)
+    if parsed_upstream.scheme not in ("http", "https") or not parsed_upstream.netloc:
+        raise HTTPException(
+            status_code=400,
+            detail="'upstream' must be an absolute http(s) URL"
+        )
+
+    # Check if we already have this URL cached
+    url_hash = hashlib.sha256(upstream_url.encode()).hexdigest()
+    cached_url = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()
+
+    if cached_url:
+        # Serve from cache
+        artifact = db.query(Artifact).filter(Artifact.id == cached_url.artifact_id).first()
+        if artifact:
+            logger.info(f"PyPI proxy: serving cached {filename} (artifact {artifact.id[:12]})")
+
+            # Stream from S3
+            try:
+                content_stream = storage.get_artifact_stream(artifact.id)
+
+                return StreamingResponse(
+                    content_stream,
+                    media_type=artifact.content_type or "application/octet-stream",
+                    headers={
+                        "Content-Disposition": f'attachment; filename="{filename}"',
+                        "Content-Length": str(artifact.size),
+                        "X-Checksum-SHA256": artifact.id,
+                        "X-Cache": "HIT",
+                    }
+                )
+            except Exception as e:
+                logger.error(f"PyPI proxy: error streaming cached artifact: {e}")
+                # Fall through to fetch from upstream
+
+    # Not cached - fetch from upstream
+    sources = _get_pypi_upstream_sources(db)
+
+    # Attach a source's credentials only when the requested URL lives on that
+    # source's own host. The URL is caller-supplied, so sending credentials
+    # to any other host would hand them to whoever controls that host.
+    # Files on other hosts (e.g. files.pythonhosted.org) are fetched
+    # without credentials.
+    upstream_host = parsed_upstream.netloc.lower()
+    matched_source = None
+    for source in sources:
+        source_host = urlparse(getattr(source, 'url', '') or '').netloc.lower()
+        if source_host and source_host == upstream_host:
+            matched_source = source
+            break
+
+    try:
+        headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
+        if matched_source:
+            headers.update(_build_auth_headers(matched_source))
+        auth = _get_basic_auth(matched_source) if matched_source else None
+
+        # httpx.Timeout needs a default (first argument) unless all four
+        # phases are given. 5 minutes for large files.
+        timeout = httpx.Timeout(
+            300.0,
+            connect=PROXY_CONNECT_TIMEOUT,
+        )
+
+        # Fetch the file
+        logger.info(f"PyPI proxy: fetching {filename} from {upstream_url}")
+
+        with httpx.Client(timeout=timeout, follow_redirects=False) as client:
+            response = client.get(
+                upstream_url,
+                headers=headers,
+                auth=auth,
+            )
+
+            # Handle redirects manually (at most 5 hops)
+            redirect_count = 0
+            while response.status_code in (301, 302, 303, 307, 308) and redirect_count < 5:
+                redirect_url = response.headers.get('location')
+                if not redirect_url:
+                    break
+
+                if not redirect_url.startswith('http'):
+                    redirect_url = urljoin(upstream_url, redirect_url)
+
+                logger.info(f"PyPI proxy: following redirect to {redirect_url}")
+
+                # Don't send auth to different hosts
+                redirect_headers = {"User-Agent": "Orchard-PyPI-Proxy/1.0"}
+                redirect_auth = None
+                if urlparse(redirect_url).netloc == urlparse(upstream_url).netloc:
+                    redirect_headers.update(headers)
+                    redirect_auth = auth
+
+                response = client.get(
+                    redirect_url,
+                    headers=redirect_headers,
+                    auth=redirect_auth,
+                    follow_redirects=False,
+                )
+                redirect_count += 1
+
+            if response.status_code != 200:
+                raise HTTPException(
+                    status_code=response.status_code,
+                    detail=f"Upstream returned {response.status_code}"
+                )
+
+            # The whole file is held in memory before being stored/returned.
+            content = response.content
+            content_type = response.headers.get('content-type', 'application/octet-stream')
+
+        # Compute hash
+        sha256 = hashlib.sha256(content).hexdigest()
+        size = len(content)
+
+        logger.info(f"PyPI proxy: downloaded {filename}, {size} bytes, sha256={sha256[:12]}")
+
+        # Store in S3
+        from io import BytesIO
+        artifact = storage.store_artifact(
+            file_obj=BytesIO(content),
+            filename=filename,
+            content_type=content_type,
+        )
+
+        # Check if artifact already exists
+        existing = db.query(Artifact).filter(Artifact.id == sha256).first()
+        if existing:
+            # Increment ref count
+            existing.ref_count += 1
+            db.flush()
+        else:
+            # Create artifact record
+            new_artifact = Artifact(
+                id=sha256,
+                filename=filename,
+                content_type=content_type,
+                size=size,
+                ref_count=1,
+            )
+            db.add(new_artifact)
+            db.flush()
+
+        # Create/get system project and package
+        system_project = db.query(Project).filter(Project.name == "_pypi").first()
+        if not system_project:
+            system_project = Project(
+                name="_pypi",
+                description="System project for cached PyPI packages",
+                visibility="private",
+            )
+            db.add(system_project)
+            db.flush()
+
+        # Normalize package name
+        normalized_name = re.sub(r'[-_.]+', '-', package_name).lower()
+
+        package = db.query(Package).filter(
+            Package.project_id == system_project.id,
+            Package.name == normalized_name,
+        ).first()
+        if not package:
+            package = Package(
+                project_id=system_project.id,
+                name=normalized_name,
+                description=f"PyPI package: {normalized_name}",
+            )
+            db.add(package)
+            db.flush()
+
+        # Create tag with filename
+        existing_tag = db.query(Tag).filter(
+            Tag.package_id == package.id,
+            Tag.name == filename,
+        ).first()
+        if not existing_tag:
+            tag = Tag(
+                package_id=package.id,
+                name=filename,
+                artifact_id=sha256,
+            )
+            db.add(tag)
+
+        # Cache the URL mapping
+        existing_cached = db.query(CachedUrl).filter(CachedUrl.url_hash == url_hash).first()
+        if not existing_cached:
+            cached_url_record = CachedUrl(
+                url_hash=url_hash,
+                url=upstream_url,
+                artifact_id=sha256,
+            )
+            db.add(cached_url_record)
+
+        db.commit()
+
+        # Return the file
+        return Response(
+            content=content,
+            media_type=content_type,
+            headers={
+                "Content-Disposition": f'attachment; filename="{filename}"',
+                "Content-Length": str(size),
+                "X-Checksum-SHA256": sha256,
+                "X-Cache": "MISS",
+            }
+        )
+
+    except httpx.ConnectError as e:
+        raise HTTPException(status_code=502, detail=f"Connection failed: {e}")
+    except httpx.TimeoutException as e:
+        raise HTTPException(status_code=504, detail=f"Timeout: {e}")
+    except HTTPException:
+        # Already a deliberate HTTP error (e.g. non-200 from upstream)
+        raise
+    except Exception as e:
+        logger.exception(f"PyPI proxy: error downloading {filename}")
+        raise HTTPException(status_code=500, detail=str(e))
--- a/backend/app/routes.py
+++ b/backend/app/routes.py
@@ -8305,6 +8305,200 @@ def _create_user_cache_reference(
    return f"{user_project_name}/{user_package_name}"


+# --- Cache Resolve Endpoint ---
+
+from .schemas import CacheResolveRequest
+
+
+@router.post(
+    "/api/v1/cache/resolve",
+    response_model=CacheResponse,
+    tags=["cache"],
+    summary="Cache an artifact by package coordinates",
+)
+def cache_resolve(
+    request: Request,
+    resolve_request: CacheResolveRequest,
+    db: Session = Depends(get_db),
+    storage: S3Storage = Depends(get_storage),
+    current_user: User = Depends(get_current_user),
+):
+    """
+    Cache an artifact by package coordinates (no URL required).
+
+    The server finds the appropriate download URL based on source_type
+    and configured upstream sources. Currently supports PyPI packages.
+
+    **Request Body:**
+    - `source_type` (required): Type of source (pypi, npm, maven, etc.)
+    - `package` (required): Package name
+    - `version` (required): Package version
+    - `user_project` (optional): Also create reference in this user project
+    - `user_package` (optional): Package name in user project
+    - `user_tag` (optional): Tag name in user project
+
+    **Example (curl):**
+    ```bash
+    curl -X POST "http://localhost:8080/api/v1/cache/resolve" \\
+      -H "Authorization: Bearer <api-key>" \\
+      -H "Content-Type: application/json" \\
+      -d '{
+        "source_type": "pypi",
+        "package": "requests",
+        "version": "2.31.0"
+      }'
+    ```
+    """
+    import re
+    import httpx
+    from html import unescape
+    from urllib.parse import unquote, urldefrag, urljoin, urlparse
+
+    if resolve_request.source_type != "pypi":
+        raise HTTPException(
+            status_code=501,
+            detail=f"Cache resolve for '{resolve_request.source_type}' not yet implemented. Currently only 'pypi' is supported."
+        )
+
+    # Get PyPI upstream sources
+    sources = (
+        db.query(UpstreamSource)
+        .filter(
+            UpstreamSource.source_type == "pypi",
+            UpstreamSource.enabled == True,
+        )
+        .order_by(UpstreamSource.priority)
+        .all()
+    )
+
+    # Also get env sources
+    env_sources = [
+        s for s in get_env_upstream_sources()
+        if s.source_type == "pypi" and s.enabled
+    ]
+    all_sources = list(sources) + list(env_sources)
+    all_sources = sorted(all_sources, key=lambda s: s.priority)
+
+    if not all_sources:
+        raise HTTPException(
+            status_code=503,
+            detail="No PyPI upstream sources configured"
+        )
+
+    # Normalize package name (PEP 503)
+    normalized_package = re.sub(r'[-_.]+', '-', resolve_request.package).lower()
+
+    # Distribution filenames look like "<name>-<version>.tar.gz" (sdist) or
+    # "<name>-<version>-<tags>.whl" (wheel). Matching on that structure,
+    # rather than on a substring of the href, keeps "2.3" from matching
+    # "2.31.0" and lets "my-pkg" match "my_pkg-1.0...".
+    version_marker = f"-{resolve_request.version}".lower()
+    sdist_suffixes = (".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip")
+
+    def _is_requested_release(candidate: str) -> bool:
+        """Return True when *candidate* is a file of exactly this package and version."""
+        lowered = candidate.lower()
+        idx = lowered.find(version_marker)
+        while idx != -1:
+            name_part = lowered[:idx]
+            rest = lowered[idx + len(version_marker):]
+            same_name = re.sub(r'[-_.]+', '-', name_part) == normalized_package
+            # After the version: a tag section ("-py3-none-any.whl") or just
+            # the sdist extension.
+            if same_name and (rest.startswith('-') or rest in sdist_suffixes):
+                return True
+            idx = lowered.find(version_marker, idx + 1)
+        return False
+
+    # Query the Simple API to find the download URL
+    download_url = None
+    matched_filename = None
+    last_error = None
+
+    for source in all_sources:
+        try:
+            headers = {"User-Agent": "Orchard-CacheResolver/1.0"}
+
+            # Build auth if needed
+            if hasattr(source, 'auth_type'):
+                if source.auth_type == "bearer":
+                    password = source.get_password() if hasattr(source, 'get_password') else getattr(source, 'password', None)
+                    if password:
+                        headers["Authorization"] = f"Bearer {password}"
+                elif source.auth_type == "api_key":
+                    custom_headers = source.get_headers() if hasattr(source, 'get_headers') else {}
+                    if custom_headers:
+                        headers.update(custom_headers)
+
+            auth = None
+            if hasattr(source, 'auth_type') and source.auth_type == "basic":
+                username = getattr(source, 'username', None)
+                if username:
+                    password = source.get_password() if hasattr(source, 'get_password') else getattr(source, 'password', '')
+                    auth = (username, password or '')
+
+            source_url = getattr(source, 'url', '')
+            package_url = source_url.rstrip('/') + f'/simple/{normalized_package}/'
+
+            # httpx.Timeout needs a default (first argument) unless all four
+            # phases are given; the read timeout doubles as that default.
+            timeout = httpx.Timeout(60.0, connect=30.0)
+
+            with httpx.Client(timeout=timeout, follow_redirects=True) as client:
+                response = client.get(package_url, headers=headers, auth=auth)
+
+                if response.status_code == 404:
+                    last_error = f"Package not found in {getattr(source, 'name', 'source')}"
+                    continue
+
+                if response.status_code != 200:
+                    last_error = f"HTTP {response.status_code} from {getattr(source, 'name', 'source')}"
+                    continue
+
+                # Collect every link whose filename is this exact release.
+                # hrefs may be relative, so resolve them against the page URL
+                # actually served (after redirects).
+                page_url = str(response.url)
+                candidates = []
+                for href in re.findall(r'href="([^"]+)"', response.text):
+                    # Drop the "#sha256=..." fragment; it is not part of the URL to fetch
+                    absolute_url = urldefrag(urljoin(page_url, unescape(href))).url
+                    raw_name = urlparse(absolute_url).path.rsplit('/', 1)[-1]
+                    if _is_requested_release(unquote(raw_name)):
+                        candidates.append((absolute_url, raw_name))
+
+                if candidates:
+                    # Prefer .tar.gz or .whl files, otherwise use the first match
+                    preferred = [c for c in candidates if c[1].endswith(('.tar.gz', '.whl'))]
+                    download_url, matched_filename = (preferred or candidates)[0]
+                    break
+
+                last_error = f"Version {resolve_request.version} not found for {resolve_request.package}"
+
+        except httpx.ConnectError as e:
+            last_error = f"Connection failed: {e}"
+            logger.warning(f"Cache resolve: failed to connect to {getattr(source, 'url', 'source')}: {e}")
+        except httpx.TimeoutException as e:
+            last_error = f"Timeout: {e}"
+            logger.warning(f"Cache resolve: timeout connecting to {getattr(source, 'url', 'source')}: {e}")
+        except Exception as e:
+            # Best effort: any other failure just moves on to the next source.
+            last_error = str(e)
+            logger.warning(f"Cache resolve: error: {e}")
+
+    if not download_url:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Could not find {resolve_request.package}=={resolve_request.version}: {last_error}"
+        )
+
+    # Now cache the artifact using the existing cache_artifact logic
+    # Construct a CacheRequest
+    cache_request = CacheRequest(
+        url=download_url,
+        source_type="pypi",
+        package_name=normalized_package,
+        tag=matched_filename or resolve_request.version,
+        user_project=resolve_request.user_project,
+        user_package=resolve_request.user_package,
+        user_tag=resolve_request.user_tag,
+    )
+
+    # Call the cache logic
+    return cache_artifact(
+        request=request,
+        cache_request=cache_request,
+        db=db,
+        storage=storage,
+        current_user=current_user,
+    )
+
+
 # --- Upstream Sources Admin API ---

 from .schemas import (
--- a/backend/app/schemas.py
+++ b/backend/app/schemas.py
@@ -1432,4 +1432,41 @@ class CacheResponse(BaseModel):
    user_reference: Optional[str] = None  # e.g., "my-app/npm-deps:lodash-4.17.21"


+class CacheResolveRequest(BaseModel):
+    """Request to cache an artifact by package coordinates (no URL required).
+
+    The server will construct the appropriate URL based on source_type and
+    configured upstream sources.
+    """
+    source_type: str
+    package: str
+    version: str
+    user_project: Optional[str] = None
+    user_package: Optional[str] = None
+    user_tag: Optional[str] = None
+
+    @field_validator('source_type')
+    @classmethod
+    def validate_source_type(cls, v: str) -> str:
+        # Only values listed in SOURCE_TYPES are accepted.
+        if v in SOURCE_TYPES:
+            return v
+        raise ValueError(f"source_type must be one of: {', '.join(SOURCE_TYPES)}")
+
+    @field_validator('package')
+    @classmethod
+    def validate_package(cls, v: str) -> str:
+        # Surrounding whitespace is dropped; a blank name is rejected.
+        trimmed = v.strip()
+        if trimmed:
+            return trimmed
+        raise ValueError("package cannot be empty")
+
+    @field_validator('version')
+    @classmethod
+    def validate_version(cls, v: str) -> str:
+        # Surrounding whitespace is dropped; a blank version is rejected.
+        trimmed = v.strip()
+        if trimmed:
+            return trimmed
+        raise ValueError("version cannot be empty")
+
+

--- a/backend/tests/integration/test_pypi_proxy.py
+++ b/backend/tests/integration/test_pypi_proxy.py
@@ -0,0 +1,93 @@
+"""Integration tests for PyPI transparent proxy."""
+
+import os
+import pytest
+import httpx
+
+
+def get_base_url():
+    """Get the base URL for the Orchard server from environment."""
+    return os.environ.get("ORCHARD_TEST_URL", "http://localhost:8080")
+
+
+class TestPyPIProxyEndpoints:
+    """Tests for PyPI proxy endpoints.
+
+    These endpoints are public (no auth required) since pip needs to use them.
+    """
+
+    @pytest.mark.integration
+    def test_pypi_simple_index_no_sources(self):
+        """Test that /pypi/simple/ returns 503 when no sources configured."""
+        with httpx.Client(base_url=get_base_url(), timeout=30.0) as client:
+            response = client.get("/pypi/simple/")
+            # Should return 503 when no PyPI upstream sources are configured
+            assert response.status_code == 503
+            assert "No PyPI upstream sources configured" in response.json()["detail"]
+
+    @pytest.mark.integration
+    def test_pypi_package_no_sources(self):
+        """Test that /pypi/simple/{package}/ returns 503 when no sources configured."""
+        with httpx.Client(base_url=get_base_url(), timeout=30.0) as client:
+            response = client.get("/pypi/simple/requests/")
+            assert response.status_code == 503
+            assert "No PyPI upstream sources configured" in response.json()["detail"]
+
+    @pytest.mark.integration
+    def test_pypi_download_missing_upstream_param(self):
+        """Test that /pypi/simple/{package}/{filename} requires upstream param."""
+        with httpx.Client(base_url=get_base_url(), timeout=30.0) as client:
+            response = client.get("/pypi/simple/requests/requests-2.31.0.tar.gz")
+            assert response.status_code == 400
+            assert "upstream" in response.json()["detail"].lower()
+
+
+class TestPyPILinkRewriting:
+    """Tests for URL rewriting in PyPI proxy responses."""
+
+    def test_rewrite_package_links(self):
+        """Test that download links are rewritten to go through proxy."""
+        from app.pypi_proxy import _rewrite_package_links
+
+        html = '''
+        <html>
+        <body>
+        <a href="https://files.pythonhosted.org/packages/ab/cd/requests-2.31.0.tar.gz#sha256=abc123">requests-2.31.0.tar.gz</a>
+        <a href="https://files.pythonhosted.org/packages/ef/gh/requests-2.31.0-py3-none-any.whl#sha256=def456">requests-2.31.0-py3-none-any.whl</a>
+        </body>
+        </html>
+        '''
+
+        result = _rewrite_package_links(html, "http://localhost:8080", "requests")
+
+        # Links should be rewritten to go through our proxy
+        assert "/pypi/simple/requests/requests-2.31.0.tar.gz?upstream=" in result
+        assert "/pypi/simple/requests/requests-2.31.0-py3-none-any.whl?upstream=" in result
+        # Original URLs should be encoded in upstream param
+        assert "files.pythonhosted.org" in result
+        # Hash fragments should be preserved
+        assert "#sha256=abc123" in result
+        assert "#sha256=def456" in result
+
+
+class TestPyPIPackageNormalization:
+    """Tests for PyPI package name normalization."""
+
+    @pytest.mark.integration
+    def test_package_name_normalized(self):
+        """Test that package names are normalized per PEP 503."""
+        # PEP 503 treats names as equal when they differ only in case or in runs
+        # of "-", "_" and "."; e.g. Requests == requests, some_package == some-package.
+        # The endpoint normalizes to lowercase with hyphens
+
+        with httpx.Client(base_url=get_base_url(), timeout=30.0) as client:
+            # Without upstream sources, we get 503, but the normalization
+            # happens before the source lookup
+            response = client.get("/pypi/simple/Requests/")
+            assert response.status_code == 503  # No sources, but path was valid
+
+            response = client.get("/pypi/simple/some_package/")
+            assert response.status_code == 503
+
+            response = client.get("/pypi/simple/some-package/")
+            assert response.status_code == 503
--- a/frontend/src/components/Layout.css
+++ b/frontend/src/components/Layout.css
@@ -272,7 +272,7 @@
 .footer {
  background: var(--bg-secondary);
  border-top: 1px solid var(--border-primary);
-  padding: 24px 0;
+  padding: 12px 0;
 }

 .footer-content {
--- a/frontend/src/pages/AdminCachePage.css
+++ b/frontend/src/pages/AdminCachePage.css
@@ -65,7 +65,7 @@
 .sources-table th,
 .sources-table td {
  padding: 0.75rem 1rem;
-  text-align: left;
+  text-align: center;
  border-bottom: 1px solid var(--border-color);
 }

@@ -91,6 +91,11 @@
  white-space: nowrap;
 }

+/* Name column should be left-aligned */
+.sources-table td:first-child {
+  text-align: left;
+}
+
 .url-cell {
  font-family: monospace;
  font-size: 0.9rem;
@@ -98,6 +103,7 @@
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
+  text-align: left;
 }

 /* Badges */
@@ -243,10 +249,22 @@
 }

 .btn-sm {
-  padding: 0.25rem 0.5rem;
+  padding: 0.25rem 0.75rem;
  font-size: 0.8rem;
 }

+.btn-secondary {
+  background-color: var(--bg-tertiary);
+  border-color: var(--border-color);
+  color: var(--text-primary);
+  font-weight: 500;
+}
+
+.btn-secondary:hover {
+  background-color: var(--bg-secondary);
+  border-color: var(--text-secondary);
+}
+
 .empty-message {
  color: var(--text-secondary);
  font-style: italic;
--- a/frontend/src/pages/AdminCachePage.tsx
+++ b/frontend/src/pages/AdminCachePage.tsx
@@ -272,8 +272,7 @@ function AdminCachePage() {
                <th>URL</th>
                <th>Priority</th>
                <th>Status</th>
-                <th>Source</th>
-                <th></th>
+                <th>Test</th>
                <th>Actions</th>
              </tr>
            </thead>
@@ -282,24 +281,18 @@ function AdminCachePage() {
                <tr key={source.id} className={source.enabled ? '' : 'disabled-row'}>
                  <td>
                    <span className="source-name">{source.name}</span>
+                    {source.source === 'env' && (
+                      <span className="env-badge" title="Defined via environment variable">ENV</span>
+                    )}
                  </td>
                  <td>{source.source_type}</td>
-                  <td className="url-cell">{source.url}</td>
+                  <td className="url-cell" title={source.url}>{source.url}</td>
                  <td>{source.priority}</td>
                  <td>
                    <span className={`status-badge ${source.enabled ? 'enabled' : 'disabled'}`}>
                      {source.enabled ? 'Enabled' : 'Disabled'}
                    </span>
                  </td>
-                  <td>
-                    {source.source === 'env' ? (
-                      <span className="env-badge" title="Defined via environment variable">
-                        ENV
-                      </span>
-                    ) : (
-                      'Database'
-                    )}
-                  </td>
                  <td className="test-cell">
                    {testingId === source.id ? (
                      <span className="test-dot testing" title="Testing...">●</span>
@@ -317,14 +310,14 @@ function AdminCachePage() {
                  </td>
                  <td className="actions-cell">
                    <button
-                      className="btn btn-sm"
+                      className="btn btn-sm btn-secondary"
                      onClick={() => handleTest(source)}
                      disabled={testingId === source.id}
                    >
                      Test
                    </button>
                    {source.source !== 'env' && (
-                      <button className="btn btn-sm" onClick={() => openEditForm(source)}>
+                      <button className="btn btn-sm btn-secondary" onClick={() => openEditForm(source)}>
                        Edit
                      </button>
                    )}
Author	SHA1	Message	Date
Mondo Diaz	dcd405679a	Merge branch 'feature/transparent-proxy' into 'main' Add transparent PyPI proxy and improve upstream sources UI Closes #108 See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!56	2026-01-29 16:12:57 -06:00
Mondo Diaz	97498b2f86	Add transparent PyPI proxy and improve upstream sources UI	2026-01-29 16:12:57 -06:00
Mondo Diaz	e8cf2462b7	Merge branch 'fix/upstream-caching-bugs-2' into 'main' Simplify cache management UI and improve test status display (#107) See merge request esv/bsf/bsf-integration/orchard/orchard-mvp!55	2026-01-29 14:25:19 -06:00
Mondo Diaz	038ad4ed1b	Simplify cache management UI and improve test status display (#107 )	2026-01-29 14:25:19 -06:00