Fix httpx.Timeout configuration in PyPI proxy

Mondo Diaz
2026-02-05 10:31:04 -06:00
parent 11c5aee0f1
commit dfa089376a
76 changed files with 9384 additions and 4407 deletions

View File

@@ -0,0 +1,262 @@
"""
Redis-backed caching service with category-aware TTL and invalidation.
Provides:
- Immutable caching for artifact data (hermetic builds)
- TTL-based caching for discovery data
- Event-driven invalidation for config changes
- Graceful fallback when Redis unavailable
"""
import logging
from enum import Enum
from typing import Optional
from .config import Settings
logger = logging.getLogger(__name__)
class CacheCategory(Enum):
"""
Cache categories with different TTL and invalidation rules.
Immutable (cache forever):
- ARTIFACT_METADATA: Artifact info by SHA256
- ARTIFACT_DEPENDENCIES: Extracted deps by SHA256
- DEPENDENCY_RESOLUTION: Resolution results by input hash
Mutable (TTL + event invalidation):
- UPSTREAM_SOURCES: Upstream config, invalidate on DB change
- PACKAGE_INDEX: PyPI/npm index pages, TTL only
- PACKAGE_VERSIONS: Version listings, TTL only
"""
# Immutable - cache forever (hermetic builds)
ARTIFACT_METADATA = "artifact"
ARTIFACT_DEPENDENCIES = "deps"
DEPENDENCY_RESOLUTION = "resolve"
# Mutable - TTL + event invalidation
UPSTREAM_SOURCES = "upstream"
PACKAGE_INDEX = "index"
PACKAGE_VERSIONS = "versions"
def get_category_ttl(category: CacheCategory, settings: Settings) -> Optional[int]:
"""
Get TTL for a cache category.
Returns:
TTL in seconds, or None for no expiry (immutable).
"""
ttl_map = {
# Immutable - no TTL
CacheCategory.ARTIFACT_METADATA: None,
CacheCategory.ARTIFACT_DEPENDENCIES: None,
CacheCategory.DEPENDENCY_RESOLUTION: None,
# Mutable - configurable TTL
CacheCategory.UPSTREAM_SOURCES: settings.cache_ttl_upstream,
CacheCategory.PACKAGE_INDEX: settings.cache_ttl_index,
CacheCategory.PACKAGE_VERSIONS: settings.cache_ttl_versions,
}
return ttl_map.get(category)
class CacheService:
"""
Redis-backed caching with category-aware TTL.
Key format: orchard:{category}:{protocol}:{identifier}
Example: orchard:deps:pypi:abc123def456
When Redis is disabled or unavailable, operations gracefully
return None/no-op to allow the application to function without caching.
"""
def __init__(self, settings: Settings):
self._settings = settings
self._enabled = settings.redis_enabled
self._redis: Optional["redis.asyncio.Redis"] = None
self._started = False
async def startup(self) -> None:
"""Initialize Redis connection. Called by FastAPI lifespan."""
if self._started:
return
if not self._enabled:
logger.info("CacheService disabled (redis_enabled=False)")
self._started = True
return
try:
import redis.asyncio as redis
logger.info(
f"Connecting to Redis at {self._settings.redis_host}:"
f"{self._settings.redis_port}/{self._settings.redis_db}"
)
self._redis = redis.Redis(
host=self._settings.redis_host,
port=self._settings.redis_port,
db=self._settings.redis_db,
password=self._settings.redis_password,
decode_responses=False, # We handle bytes
)
# Test connection
await self._redis.ping()
logger.info("CacheService connected to Redis")
except ImportError:
logger.warning("redis package not installed, caching disabled")
self._enabled = False
except Exception as e:
logger.warning(f"Redis connection failed, caching disabled: {e}")
self._enabled = False
self._redis = None
self._started = True
async def shutdown(self) -> None:
"""Close Redis connection. Called by FastAPI lifespan."""
if not self._started:
return
if self._redis:
await self._redis.aclose()
self._redis = None
self._started = False
logger.info("CacheService shutdown complete")
@staticmethod
def _make_key(category: CacheCategory, protocol: str, identifier: str) -> str:
"""Build namespaced cache key."""
return f"orchard:{category.value}:{protocol}:{identifier}"
async def get(
self,
category: CacheCategory,
key: str,
protocol: str = "default",
) -> Optional[bytes]:
"""
Get cached value.
Args:
category: Cache category for TTL rules
key: Unique identifier within category
protocol: Protocol namespace (pypi, npm, etc.)
Returns:
Cached bytes or None if not found/disabled.
"""
if not self._enabled or not self._redis:
return None
try:
full_key = self._make_key(category, protocol, key)
return await self._redis.get(full_key)
except Exception as e:
logger.warning(f"Cache get failed for {key}: {e}")
return None
async def set(
self,
category: CacheCategory,
key: str,
value: bytes,
protocol: str = "default",
) -> None:
"""
Set cached value with category-appropriate TTL.
Args:
category: Cache category for TTL rules
key: Unique identifier within category
value: Bytes to cache
protocol: Protocol namespace (pypi, npm, etc.)
"""
if not self._enabled or not self._redis:
return
try:
full_key = self._make_key(category, protocol, key)
ttl = get_category_ttl(category, self._settings)
if ttl is None:
await self._redis.set(full_key, value)
else:
await self._redis.setex(full_key, ttl, value)
except Exception as e:
logger.warning(f"Cache set failed for {key}: {e}")
async def delete(
self,
category: CacheCategory,
key: str,
protocol: str = "default",
) -> None:
"""Delete a specific cache entry."""
if not self._enabled or not self._redis:
return
try:
full_key = self._make_key(category, protocol, key)
await self._redis.delete(full_key)
except Exception as e:
logger.warning(f"Cache delete failed for {key}: {e}")
async def invalidate_pattern(
self,
category: CacheCategory,
pattern: str = "*",
protocol: str = "default",
) -> int:
"""
Invalidate all entries matching pattern.
Args:
category: Cache category
pattern: Glob pattern for keys (default "*" = all in category)
protocol: Protocol namespace
Returns:
Number of keys deleted.
"""
if not self._enabled or not self._redis:
return 0
try:
full_pattern = self._make_key(category, protocol, pattern)
keys = []
async for key in self._redis.scan_iter(match=full_pattern):
keys.append(key)
if keys:
return await self._redis.delete(*keys)
return 0
except Exception as e:
logger.warning(f"Cache invalidate failed for pattern {pattern}: {e}")
return 0
async def ping(self) -> bool:
"""Check if Redis is connected and responding."""
if not self._enabled or not self._redis:
return False
try:
await self._redis.ping()
return True
except Exception:
return False
@property
def enabled(self) -> bool:
"""Check if caching is enabled."""
return self._enabled
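
A minimal usage sketch (not part of this diff), assuming the service is exposed as app.state.cache the way main.py wires it below; fetch_index_from_upstream is a hypothetical helper:

# Hypothetical caller: serve a PyPI index page through the cache.
from fastapi import Request
from .cache_service import CacheCategory, CacheService  # this module

async def get_index_page(request: Request, package: str) -> bytes:
    cache: CacheService = request.app.state.cache
    cached = await cache.get(CacheCategory.PACKAGE_INDEX, package, protocol="pypi")
    if cached is not None:
        return cached  # hit: served from Redis within cache_ttl_index
    body = await fetch_index_from_upstream(package)  # hypothetical upstream fetch
    await cache.set(CacheCategory.PACKAGE_INDEX, package, body, protocol="pypi")
    return body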

View File

@@ -22,8 +22,8 @@ class Settings(BaseSettings):
database_sslmode: str = "disable"
# Database connection pool settings
database_pool_size: int = 5 # Number of connections to keep open
database_max_overflow: int = 10 # Max additional connections beyond pool_size
database_pool_size: int = 20 # Number of connections to keep open
database_max_overflow: int = 30 # Max additional connections beyond pool_size
database_pool_timeout: int = 30 # Seconds to wait for a connection from pool
database_pool_recycle: int = (
1800 # Recycle connections after this many seconds (30 min)
@@ -51,6 +51,26 @@ class Settings(BaseSettings):
presigned_url_expiry: int = (
3600 # Presigned URL expiry in seconds (default: 1 hour)
)
pypi_download_mode: str = "redirect" # "redirect" (to S3) or "proxy" (stream through Orchard)
# HTTP Client pool settings
http_max_connections: int = 100 # Max connections per pool
http_max_keepalive: int = 20 # Keep-alive connections
http_connect_timeout: float = 30.0 # Connection timeout seconds
http_read_timeout: float = 60.0 # Read timeout seconds
http_worker_threads: int = 32 # Thread pool for blocking ops
# Redis cache settings
redis_host: str = "localhost"
redis_port: int = 6379
redis_db: int = 0
redis_password: Optional[str] = None
redis_enabled: bool = True # Set False to disable caching
# Cache TTL settings (seconds, 0 = no expiry)
cache_ttl_index: int = 300 # Package index pages: 5 min
cache_ttl_versions: int = 300 # Version listings: 5 min
cache_ttl_upstream: int = 3600 # Upstream source config: 1 hour
# Logging settings
log_level: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
@@ -64,6 +84,15 @@ class Settings(BaseSettings):
# Global cache settings override (None = use DB value, True/False = override DB)
cache_auto_create_system_projects: Optional[bool] = None # Override auto_create_system_projects
# PyPI Cache Worker settings
pypi_cache_workers: int = 5 # Number of concurrent cache workers
pypi_cache_max_depth: int = 10 # Maximum recursion depth for dependency caching
pypi_cache_max_attempts: int = 3 # Maximum retry attempts for failed cache tasks
# Auto-fetch configuration for dependency resolution
auto_fetch_dependencies: bool = False # Server default for auto_fetch parameter
auto_fetch_timeout: int = 300 # Total timeout for auto-fetch resolution in seconds
# JWT Authentication settings (optional, for external identity providers)
jwt_enabled: bool = False # Enable JWT token validation
jwt_secret: str = "" # Secret key for HS256, or leave empty for RS256 with JWKS
@@ -88,6 +117,24 @@ class Settings(BaseSettings):
def is_production(self) -> bool:
return self.env.lower() == "production"
@property
def PORT(self) -> int:
"""Alias for server_port for compatibility."""
return self.server_port
# Uppercase aliases for PyPI cache settings (for backward compatibility)
@property
def PYPI_CACHE_WORKERS(self) -> int:
return self.pypi_cache_workers
@property
def PYPI_CACHE_MAX_DEPTH(self) -> int:
return self.pypi_cache_max_depth
@property
def PYPI_CACHE_MAX_ATTEMPTS(self) -> int:
return self.pypi_cache_max_attempts
class Config:
env_prefix = "ORCHARD_"
case_sensitive = False
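
Because the settings class uses env_prefix = "ORCHARD_" with case_sensitive = False, the new pool and cache knobs can be overridden from the environment. A quick illustrative sketch (module path and values are assumptions):

# Illustrative only: environment overrides picked up by the pydantic Settings class.
import os
from app.config import Settings  # assumed import path for this file

os.environ["ORCHARD_REDIS_HOST"] = "redis.internal"      # -> settings.redis_host
os.environ["ORCHARD_REDIS_ENABLED"] = "false"            # -> settings.redis_enabled (disables caching)
os.environ["ORCHARD_HTTP_MAX_CONNECTIONS"] = "200"       # -> settings.http_max_connections
os.environ["ORCHARD_CACHE_TTL_INDEX"] = "600"            # -> settings.cache_ttl_index (10 min)

settings = Settings()  # reads the ORCHARD_-prefixed variables above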

View File

@@ -220,17 +220,7 @@ def _run_migrations():
CREATE UNIQUE INDEX idx_packages_project_name ON packages(project_id, name);
END IF;
IF NOT EXISTS (
SELECT 1 FROM pg_indexes WHERE indexname = 'idx_tags_package_name'
) THEN
CREATE UNIQUE INDEX idx_tags_package_name ON tags(package_id, name);
END IF;
IF NOT EXISTS (
SELECT 1 FROM pg_indexes WHERE indexname = 'idx_tags_package_created_at'
) THEN
CREATE INDEX idx_tags_package_created_at ON tags(package_id, created_at);
END IF;
-- Tag indexes removed: tags table no longer exists (tag system removed in migration 024)
END $$;
""",
),
@@ -287,27 +277,8 @@ def _run_migrations():
Migration(
name="008_create_tags_ref_count_triggers",
sql="""
DO $$
BEGIN
DROP TRIGGER IF EXISTS tags_ref_count_insert_trigger ON tags;
CREATE TRIGGER tags_ref_count_insert_trigger
AFTER INSERT ON tags
FOR EACH ROW
EXECUTE FUNCTION increment_artifact_ref_count();
DROP TRIGGER IF EXISTS tags_ref_count_delete_trigger ON tags;
CREATE TRIGGER tags_ref_count_delete_trigger
AFTER DELETE ON tags
FOR EACH ROW
EXECUTE FUNCTION decrement_artifact_ref_count();
DROP TRIGGER IF EXISTS tags_ref_count_update_trigger ON tags;
CREATE TRIGGER tags_ref_count_update_trigger
AFTER UPDATE ON tags
FOR EACH ROW
WHEN (OLD.artifact_id IS DISTINCT FROM NEW.artifact_id)
EXECUTE FUNCTION update_artifact_ref_count();
END $$;
-- Tags table removed: triggers no longer needed (tag system removed)
DO $$ BEGIN NULL; END $$;
""",
),
Migration(
@@ -354,9 +325,11 @@ def _run_migrations():
Migration(
name="011_migrate_semver_tags_to_versions",
sql=r"""
-- Migrate semver tags to versions (only if both tables exist - for existing databases)
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'package_versions') THEN
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'package_versions')
AND EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'tags') THEN
INSERT INTO package_versions (id, package_id, artifact_id, version, version_source, created_by, created_at)
SELECT
gen_random_uuid(),
@@ -565,6 +538,62 @@ def _run_migrations():
WHERE name IN ('npm-public', 'pypi-public', 'maven-central', 'docker-hub');
""",
),
Migration(
name="024_remove_tags",
sql="""
-- Remove tag system, keeping only versions for artifact references
DO $$
BEGIN
-- Drop triggers on tags table (if they exist)
DROP TRIGGER IF EXISTS tags_ref_count_insert_trigger ON tags;
DROP TRIGGER IF EXISTS tags_ref_count_delete_trigger ON tags;
DROP TRIGGER IF EXISTS tags_ref_count_update_trigger ON tags;
DROP TRIGGER IF EXISTS tags_updated_at_trigger ON tags;
DROP TRIGGER IF EXISTS tag_changes_trigger ON tags;
-- Drop the tag change tracking function
DROP FUNCTION IF EXISTS track_tag_changes();
-- Remove tag_constraint from artifact_dependencies
IF EXISTS (
SELECT 1 FROM information_schema.table_constraints
WHERE constraint_name = 'check_constraint_type'
AND table_name = 'artifact_dependencies'
) THEN
ALTER TABLE artifact_dependencies DROP CONSTRAINT check_constraint_type;
END IF;
-- Remove the tag_constraint column if it exists
IF EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'artifact_dependencies' AND column_name = 'tag_constraint'
) THEN
ALTER TABLE artifact_dependencies DROP COLUMN tag_constraint;
END IF;
-- Make version_constraint NOT NULL
UPDATE artifact_dependencies SET version_constraint = '*' WHERE version_constraint IS NULL;
ALTER TABLE artifact_dependencies ALTER COLUMN version_constraint SET NOT NULL;
-- Drop tag_history table first (depends on tags)
DROP TABLE IF EXISTS tag_history;
-- Drop tags table
DROP TABLE IF EXISTS tags;
-- Rename uploads.tag_name to version if it exists and version doesn't
IF EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'uploads' AND column_name = 'tag_name'
) AND NOT EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name = 'uploads' AND column_name = 'version'
) THEN
ALTER TABLE uploads RENAME COLUMN tag_name TO version;
END IF;
END $$;
""",
),
]
with engine.connect() as conn:

175
backend/app/db_utils.py Normal file
View File

@@ -0,0 +1,175 @@
"""
Database utilities for optimized artifact operations.
Provides batch operations to eliminate N+1 queries.
"""
import logging
from typing import Optional
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.orm import Session
from .models import Artifact, ArtifactDependency, CachedUrl
logger = logging.getLogger(__name__)
class ArtifactRepository:
"""
Optimized database operations for artifact storage.
Key optimizations:
- Atomic upserts using ON CONFLICT
- Batch inserts for dependencies
- Joined queries to avoid N+1
"""
def __init__(self, db: Session):
self.db = db
@staticmethod
def _format_dependency_values(
artifact_id: str,
dependencies: list[tuple[str, str, str]],
) -> list[dict]:
"""
Format dependencies for batch insert.
Args:
artifact_id: SHA256 of the artifact
dependencies: List of (project, package, version_constraint)
Returns:
List of dicts ready for bulk insert.
"""
return [
{
"artifact_id": artifact_id,
"dependency_project": proj,
"dependency_package": pkg,
"version_constraint": ver,
}
for proj, pkg, ver in dependencies
]
def get_or_create_artifact(
self,
sha256: str,
size: int,
filename: str,
content_type: Optional[str] = None,
created_by: str = "system",
s3_key: Optional[str] = None,
) -> tuple[Artifact, bool]:
"""
Get existing artifact or create new one atomically.
Uses INSERT ... ON CONFLICT DO UPDATE to handle races.
If artifact exists, increments ref_count.
Args:
sha256: Content hash (primary key)
size: File size in bytes
filename: Original filename
content_type: MIME type
created_by: User who created the artifact
s3_key: S3 storage key (defaults to standard path)
Returns:
(artifact, created) tuple where created is True for new artifacts.
"""
if s3_key is None:
s3_key = f"fruits/{sha256[:2]}/{sha256[2:4]}/{sha256}"
stmt = pg_insert(Artifact).values(
id=sha256,
size=size,
original_name=filename,
content_type=content_type,
ref_count=1,
created_by=created_by,
s3_key=s3_key,
).on_conflict_do_update(
index_elements=['id'],
set_={'ref_count': Artifact.ref_count + 1}
).returning(Artifact)
result = self.db.execute(stmt)
artifact = result.scalar_one()
# Check if this was an insert or update by comparing ref_count
# ref_count=1 means new, >1 means existing
created = artifact.ref_count == 1
return artifact, created
def batch_upsert_dependencies(
self,
artifact_id: str,
dependencies: list[tuple[str, str, str]],
) -> int:
"""
Insert dependencies in a single batch operation.
Uses ON CONFLICT DO NOTHING to skip duplicates.
Args:
artifact_id: SHA256 of the artifact
dependencies: List of (project, package, version_constraint)
Returns:
Number of dependencies inserted.
"""
if not dependencies:
return 0
values = self._format_dependency_values(artifact_id, dependencies)
stmt = pg_insert(ArtifactDependency).values(values)
stmt = stmt.on_conflict_do_nothing(
index_elements=['artifact_id', 'dependency_project', 'dependency_package']
)
result = self.db.execute(stmt)
return result.rowcount
def get_cached_url_with_artifact(
self,
url_hash: str,
) -> Optional[tuple[CachedUrl, Artifact]]:
"""
Get cached URL and its artifact in a single query.
Args:
url_hash: SHA256 of the URL
Returns:
(CachedUrl, Artifact) tuple or None if not found.
"""
result = (
self.db.query(CachedUrl, Artifact)
.join(Artifact, CachedUrl.artifact_id == Artifact.id)
.filter(CachedUrl.url_hash == url_hash)
.first()
)
return result
def get_artifact_dependencies(
self,
artifact_id: str,
) -> list[ArtifactDependency]:
"""
Get all dependencies for an artifact in a single query.
Args:
artifact_id: SHA256 of the artifact
Returns:
List of ArtifactDependency objects.
"""
return (
self.db.query(ArtifactDependency)
.filter(ArtifactDependency.artifact_id == artifact_id)
.all()
)
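
A minimal sketch of how a caller might use these batch helpers (not part of this diff); the hash, filenames, and dependency list are placeholders, and db is assumed to be an open SQLAlchemy Session:

# Hypothetical caller: store an artifact and its dependencies without N+1 queries.
from sqlalchemy.orm import Session
from .db_utils import ArtifactRepository  # this module

def store_upload(db: Session, sha256: str) -> None:
    repo = ArtifactRepository(db)
    artifact, created = repo.get_or_create_artifact(
        sha256=sha256,
        size=1024,
        filename="example-1.0.0-py3-none-any.whl",  # placeholder
        content_type="application/zip",
        created_by="ci-bot",
    )
    # One batched INSERT ... ON CONFLICT DO NOTHING covers every dependency row.
    repo.batch_upsert_dependencies(
        artifact.id,
        [("pypi", "requests", ">=2.31"), ("pypi", "urllib3", "*")],
    )
    db.commit()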

File diff suppressed because it is too large

179
backend/app/http_client.py Normal file
View File

@@ -0,0 +1,179 @@
"""
HTTP client manager with connection pooling and lifecycle management.
Provides:
- Shared connection pools for upstream requests
- Per-upstream client isolation when needed
- Thread pool for blocking I/O operations
- FastAPI lifespan integration
"""
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Optional
import httpx
from .config import Settings
logger = logging.getLogger(__name__)
class HttpClientManager:
"""
Manages httpx.AsyncClient pools with FastAPI lifespan integration.
Features:
- Default shared pool for general requests
- Per-upstream pools for sources needing specific config/auth
- Dedicated thread pool for blocking operations
- Graceful shutdown
"""
def __init__(self, settings: Settings):
self.max_connections = settings.http_max_connections
self.max_keepalive = settings.http_max_keepalive
self.connect_timeout = settings.http_connect_timeout
self.read_timeout = settings.http_read_timeout
self.worker_threads = settings.http_worker_threads
self._default_client: Optional[httpx.AsyncClient] = None
self._upstream_clients: dict[str, httpx.AsyncClient] = {}
self._executor: Optional[ThreadPoolExecutor] = None
self._started = False
async def startup(self) -> None:
"""Initialize clients and thread pool. Called by FastAPI lifespan."""
if self._started:
return
logger.info(
f"Starting HttpClientManager: max_connections={self.max_connections}, "
f"worker_threads={self.worker_threads}"
)
# Create connection limits
limits = httpx.Limits(
max_connections=self.max_connections,
max_keepalive_connections=self.max_keepalive,
)
# Create timeout config
timeout = httpx.Timeout(
connect=self.connect_timeout,
read=self.read_timeout,
write=self.read_timeout,
pool=self.connect_timeout,
)
# Create default client
self._default_client = httpx.AsyncClient(
limits=limits,
timeout=timeout,
follow_redirects=False, # Handle redirects manually for auth
)
# Create thread pool for blocking operations
self._executor = ThreadPoolExecutor(
max_workers=self.worker_threads,
thread_name_prefix="orchard-blocking-",
)
self._started = True
logger.info("HttpClientManager started")
async def shutdown(self) -> None:
"""Close all clients and thread pool. Called by FastAPI lifespan."""
if not self._started:
return
logger.info("Shutting down HttpClientManager")
# Close default client
if self._default_client:
await self._default_client.aclose()
self._default_client = None
# Close upstream-specific clients
for name, client in self._upstream_clients.items():
logger.debug(f"Closing upstream client: {name}")
await client.aclose()
self._upstream_clients.clear()
# Shutdown thread pool
if self._executor:
self._executor.shutdown(wait=True)
self._executor = None
self._started = False
logger.info("HttpClientManager shutdown complete")
def get_client(self, upstream_name: Optional[str] = None) -> httpx.AsyncClient:
"""
Get HTTP client for making requests.
Args:
upstream_name: Optional upstream source name for dedicated pool.
If None, returns the default shared client.
Returns:
httpx.AsyncClient configured for the request.
Raises:
RuntimeError: If manager not started.
"""
if not self._started or not self._default_client:
raise RuntimeError("HttpClientManager not started. Call startup() first.")
if upstream_name and upstream_name in self._upstream_clients:
return self._upstream_clients[upstream_name]
return self._default_client
async def run_blocking(self, func: Callable[..., Any], *args: Any) -> Any:
"""
Run a blocking function in the thread pool.
Use this for:
- File I/O operations
- Archive extraction (zipfile, tarfile)
- Hash computation on large data
Args:
func: Synchronous function to execute
*args: Arguments to pass to the function
Returns:
The function's return value.
"""
if not self._executor:
raise RuntimeError("HttpClientManager not started. Call startup() first.")
loop = asyncio.get_running_loop()
return await loop.run_in_executor(self._executor, func, *args)
@property
def active_connections(self) -> int:
"""Get approximate number of active connections (for health checks)."""
if not self._default_client:
return 0
# httpx doesn't expose this directly; return the configured pool size as an approximation
return self.max_connections
@property
def pool_size(self) -> int:
"""Get configured pool size."""
return self.max_connections
@property
def executor_active(self) -> int:
"""Get number of active thread pool workers."""
if not self._executor:
return 0
return len(self._executor._threads)
@property
def executor_max(self) -> int:
"""Get max thread pool workers."""
return self.worker_threads
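
A usage sketch (not part of this diff), assuming manager is the HttpClientManager that main.py stores on app.state.http_client; the URL and destination path are placeholders:

# Hypothetical caller: pooled upstream request plus blocking work in the thread pool.
import zipfile
from .http_client import HttpClientManager  # this module

async def download_and_list(manager: HttpClientManager, url: str, dest: str) -> list[str]:
    client = manager.get_client()  # shared pooled httpx.AsyncClient
    resp = await client.get(url)
    resp.raise_for_status()
    with open(dest, "wb") as fh:
        fh.write(resp.content)
    # zipfile is blocking, so run it on the manager's executor instead of the event loop.
    return await manager.run_blocking(lambda: zipfile.ZipFile(dest).namelist())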

View File

@@ -15,6 +15,8 @@ from .pypi_proxy import router as pypi_router
from .seed import seed_database
from .auth import create_default_admin
from .rate_limit import limiter
from .http_client import HttpClientManager
from .cache_service import CacheService
settings = get_settings()
logging.basicConfig(level=logging.INFO)
@@ -38,6 +40,17 @@ async def lifespan(app: FastAPI):
finally:
db.close()
# Initialize infrastructure services
logger.info("Initializing infrastructure services...")
app.state.http_client = HttpClientManager(settings)
await app.state.http_client.startup()
app.state.cache = CacheService(settings)
await app.state.cache.startup()
logger.info("Infrastructure services ready")
# Seed test data in development mode
if settings.is_development:
logger.info(f"Running in {settings.env} mode - checking for seed data")
@@ -50,7 +63,12 @@ async def lifespan(app: FastAPI):
logger.info(f"Running in {settings.env} mode - skipping seed data")
yield
# Shutdown: cleanup if needed
# Shutdown infrastructure services
logger.info("Shutting down infrastructure services...")
await app.state.http_client.shutdown()
await app.state.cache.shutdown()
logger.info("Shutdown complete")
app = FastAPI(

View File

@@ -71,7 +71,6 @@ class Package(Base):
)
project = relationship("Project", back_populates="packages")
tags = relationship("Tag", back_populates="package", cascade="all, delete-orphan")
uploads = relationship(
"Upload", back_populates="package", cascade="all, delete-orphan"
)
@@ -120,7 +119,6 @@ class Artifact(Base):
ref_count = Column(Integer, default=1)
s3_key = Column(String(1024), nullable=False)
tags = relationship("Tag", back_populates="artifact")
uploads = relationship("Upload", back_populates="artifact")
versions = relationship("PackageVersion", back_populates="artifact")
dependencies = relationship(
@@ -151,65 +149,6 @@ class Artifact(Base):
)
class Tag(Base):
__tablename__ = "tags"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
package_id = Column(
UUID(as_uuid=True),
ForeignKey("packages.id", ondelete="CASCADE"),
nullable=False,
)
name = Column(String(255), nullable=False)
artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
updated_at = Column(
DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow
)
created_by = Column(String(255), nullable=False)
package = relationship("Package", back_populates="tags")
artifact = relationship("Artifact", back_populates="tags")
history = relationship(
"TagHistory", back_populates="tag", cascade="all, delete-orphan"
)
__table_args__ = (
Index("idx_tags_package_id", "package_id"),
Index("idx_tags_artifact_id", "artifact_id"),
Index(
"idx_tags_package_name", "package_id", "name", unique=True
), # Composite unique index
Index(
"idx_tags_package_created_at", "package_id", "created_at"
), # For recent tags queries
)
class TagHistory(Base):
__tablename__ = "tag_history"
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
tag_id = Column(
UUID(as_uuid=True), ForeignKey("tags.id", ondelete="CASCADE"), nullable=False
)
old_artifact_id = Column(String(64), ForeignKey("artifacts.id"))
new_artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
change_type = Column(String(20), nullable=False, default="update")
changed_at = Column(DateTime(timezone=True), default=datetime.utcnow)
changed_by = Column(String(255), nullable=False)
tag = relationship("Tag", back_populates="history")
__table_args__ = (
Index("idx_tag_history_tag_id", "tag_id"),
Index("idx_tag_history_changed_at", "changed_at"),
CheckConstraint(
"change_type IN ('create', 'update', 'delete')", name="check_change_type"
),
)
class PackageVersion(Base):
"""Immutable version record for a package-artifact relationship.
@@ -249,7 +188,7 @@ class Upload(Base):
artifact_id = Column(String(64), ForeignKey("artifacts.id"), nullable=False)
package_id = Column(UUID(as_uuid=True), ForeignKey("packages.id"), nullable=False)
original_name = Column(String(1024))
tag_name = Column(String(255)) # Tag assigned during upload
version = Column(String(255)) # Version assigned during upload
user_agent = Column(String(512)) # Client identification
duration_ms = Column(Integer) # Upload timing in milliseconds
deduplicated = Column(Boolean, default=False) # Whether artifact was deduplicated
@@ -524,8 +463,8 @@ class PackageHistory(Base):
class ArtifactDependency(Base):
"""Dependency declared by an artifact on another package.
Each artifact can declare dependencies on other packages, specifying either
an exact version or a tag. This enables recursive dependency resolution.
Each artifact can declare dependencies on other packages, specifying a version.
This enables recursive dependency resolution.
"""
__tablename__ = "artifact_dependencies"
@@ -538,20 +477,13 @@ class ArtifactDependency(Base):
)
dependency_project = Column(String(255), nullable=False)
dependency_package = Column(String(255), nullable=False)
version_constraint = Column(String(255), nullable=True)
tag_constraint = Column(String(255), nullable=True)
version_constraint = Column(String(255), nullable=False)
created_at = Column(DateTime(timezone=True), default=datetime.utcnow)
# Relationship to the artifact that declares this dependency
artifact = relationship("Artifact", back_populates="dependencies")
__table_args__ = (
# Exactly one of version_constraint or tag_constraint must be set
CheckConstraint(
"(version_constraint IS NOT NULL AND tag_constraint IS NULL) OR "
"(version_constraint IS NULL AND tag_constraint IS NOT NULL)",
name="check_constraint_type",
),
# Each artifact can only depend on a specific project/package once
Index(
"idx_artifact_dependencies_artifact_id",

View File

@@ -12,7 +12,6 @@ from .models import (
Project,
Package,
Artifact,
Tag,
Upload,
PackageVersion,
ArtifactDependency,
@@ -60,7 +59,6 @@ def purge_seed_data(db: Session) -> dict:
results = {
"dependencies_deleted": 0,
"tags_deleted": 0,
"versions_deleted": 0,
"uploads_deleted": 0,
"artifacts_deleted": 0,
@@ -103,15 +101,7 @@ def purge_seed_data(db: Session) -> dict:
results["dependencies_deleted"] = count
logger.info(f"Deleted {count} artifact dependencies")
# 2. Delete tags
if seed_package_ids:
count = db.query(Tag).filter(Tag.package_id.in_(seed_package_ids)).delete(
synchronize_session=False
)
results["tags_deleted"] = count
logger.info(f"Deleted {count} tags")
# 3. Delete package versions
# 2. Delete package versions
if seed_package_ids:
count = db.query(PackageVersion).filter(
PackageVersion.package_id.in_(seed_package_ids)
@@ -119,7 +109,7 @@ def purge_seed_data(db: Session) -> dict:
results["versions_deleted"] = count
logger.info(f"Deleted {count} package versions")
# 4. Delete uploads
# 3. Delete uploads
if seed_package_ids:
count = db.query(Upload).filter(Upload.package_id.in_(seed_package_ids)).delete(
synchronize_session=False
@@ -127,7 +117,7 @@ def purge_seed_data(db: Session) -> dict:
results["uploads_deleted"] = count
logger.info(f"Deleted {count} uploads")
# 5. Delete S3 objects for seed artifacts
# 4. Delete S3 objects for seed artifacts
if seed_artifact_ids:
seed_artifacts = db.query(Artifact).filter(Artifact.id.in_(seed_artifact_ids)).all()
for artifact in seed_artifacts:
@@ -139,8 +129,8 @@ def purge_seed_data(db: Session) -> dict:
logger.warning(f"Failed to delete S3 object {artifact.s3_key}: {e}")
logger.info(f"Deleted {results['s3_objects_deleted']} S3 objects")
# 6. Delete artifacts (only those with ref_count that would be 0 after our deletions)
# Since we deleted all tags/versions pointing to these artifacts, we can delete them
# 5. Delete artifacts (only those with ref_count that would be 0 after our deletions)
# Since we deleted all versions pointing to these artifacts, we can delete them
if seed_artifact_ids:
count = db.query(Artifact).filter(Artifact.id.in_(seed_artifact_ids)).delete(
synchronize_session=False
@@ -148,7 +138,7 @@ def purge_seed_data(db: Session) -> dict:
results["artifacts_deleted"] = count
logger.info(f"Deleted {count} artifacts")
# 7. Delete packages
# 6. Delete packages
if seed_package_ids:
count = db.query(Package).filter(Package.id.in_(seed_package_ids)).delete(
synchronize_session=False
@@ -156,7 +146,7 @@ def purge_seed_data(db: Session) -> dict:
results["packages_deleted"] = count
logger.info(f"Deleted {count} packages")
# 8. Delete access permissions for seed projects
# 7. Delete access permissions for seed projects
if seed_project_ids:
count = db.query(AccessPermission).filter(
AccessPermission.project_id.in_(seed_project_ids)
@@ -164,14 +154,14 @@ def purge_seed_data(db: Session) -> dict:
results["permissions_deleted"] = count
logger.info(f"Deleted {count} access permissions")
# 9. Delete seed projects
# 8. Delete seed projects
count = db.query(Project).filter(Project.name.in_(SEED_PROJECT_NAMES)).delete(
synchronize_session=False
)
results["projects_deleted"] = count
logger.info(f"Deleted {count} projects")
# 10. Find and delete seed team
# 9. Find and delete seed team
seed_team = db.query(Team).filter(Team.slug == SEED_TEAM_SLUG).first()
if seed_team:
# Delete team memberships first
@@ -186,7 +176,7 @@ def purge_seed_data(db: Session) -> dict:
results["teams_deleted"] = 1
logger.info(f"Deleted team: {SEED_TEAM_SLUG}")
# 11. Delete seed users (but NOT admin)
# 10. Delete seed users (but NOT admin)
seed_users = db.query(User).filter(User.username.in_(SEED_USERNAMES)).all()
for user in seed_users:
# Delete any remaining team memberships for this user

File diff suppressed because it is too large
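
The pypi_proxy.py change named in the commit title appears to sit inside one of these collapsed diffs, so it is not shown here. For reference, these are the two valid httpx.Timeout forms this commit uses elsewhere (httpx accepts either a default value with per-phase overrides, or all four phases given explicitly):

# Patterns used in registry_client.py and http_client.py respectively.
import httpx

# Default timeout plus a connect override (registry_client.py style).
api_timeout = httpx.Timeout(60.0, connect=30.0)

# All four phases set explicitly, so no default is required (http_client.py style).
pool_timeout = httpx.Timeout(connect=30.0, read=60.0, write=60.0, pool=30.0)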

View File

@@ -0,0 +1,426 @@
"""
Registry client abstraction for upstream package registries.
Provides a pluggable interface for fetching packages from upstream registries
(PyPI, npm, Maven, etc.) during dependency resolution with auto-fetch enabled.
"""
import hashlib
import logging
import os
import re
import tempfile
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional, TYPE_CHECKING
from urllib.parse import urljoin, urlparse
import httpx
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
from sqlalchemy.orm import Session
if TYPE_CHECKING:
from .storage import S3Storage
from .http_client import HttpClientManager
logger = logging.getLogger(__name__)
@dataclass
class VersionInfo:
"""Information about a package version from an upstream registry."""
version: str
download_url: str
filename: str
sha256: Optional[str] = None
size: Optional[int] = None
content_type: Optional[str] = None
@dataclass
class FetchResult:
"""Result of fetching a package from upstream."""
artifact_id: str # SHA256 hash
size: int
version: str
filename: str
already_cached: bool = False
class RegistryClient(ABC):
"""Abstract base class for upstream registry clients."""
@property
@abstractmethod
def source_type(self) -> str:
"""Return the source type this client handles (e.g., 'pypi', 'npm')."""
pass
@abstractmethod
async def get_available_versions(self, package_name: str) -> List[str]:
"""
Get all available versions of a package from upstream.
Args:
package_name: The normalized package name
Returns:
List of version strings, sorted from oldest to newest
"""
pass
@abstractmethod
async def resolve_constraint(
self, package_name: str, constraint: str
) -> Optional[VersionInfo]:
"""
Find the best version matching a constraint.
Args:
package_name: The normalized package name
constraint: Version constraint (e.g., '>=1.9', '<2.0,>=1.5', '*')
Returns:
VersionInfo with download URL, or None if no matching version found
"""
pass
@abstractmethod
async def fetch_package(
self,
package_name: str,
version_info: VersionInfo,
db: Session,
storage: "S3Storage",
) -> Optional[FetchResult]:
"""
Fetch and cache a package from upstream.
Args:
package_name: The normalized package name
version_info: Version details including download URL
db: Database session for creating records
storage: S3 storage for caching the artifact
Returns:
FetchResult with artifact_id, or None if fetch failed
"""
pass
class PyPIRegistryClient(RegistryClient):
"""PyPI registry client using the JSON API."""
# Timeout configuration for PyPI requests
CONNECT_TIMEOUT = 30.0
READ_TIMEOUT = 60.0
DOWNLOAD_TIMEOUT = 300.0 # Longer timeout for file downloads
def __init__(
self,
http_client: httpx.AsyncClient,
upstream_sources: List,
pypi_api_url: str = "https://pypi.org/pypi",
):
"""
Initialize PyPI registry client.
Args:
http_client: Shared async HTTP client
upstream_sources: List of configured upstream sources for auth
pypi_api_url: Base URL for PyPI JSON API
"""
self.client = http_client
self.sources = upstream_sources
self.api_url = pypi_api_url
@property
def source_type(self) -> str:
return "pypi"
def _normalize_package_name(self, name: str) -> str:
"""Normalize a PyPI package name per PEP 503."""
return re.sub(r"[-_.]+", "-", name).lower()
def _get_auth_headers(self) -> dict:
"""Get authentication headers from configured sources."""
headers = {"User-Agent": "Orchard-Registry-Client/1.0"}
if self.sources:
source = self.sources[0]
if hasattr(source, "auth_type"):
if source.auth_type == "bearer":
password = (
source.get_password()
if hasattr(source, "get_password")
else getattr(source, "password", None)
)
if password:
headers["Authorization"] = f"Bearer {password}"
elif source.auth_type == "api_key":
custom_headers = (
source.get_headers()
if hasattr(source, "get_headers")
else {}
)
if custom_headers:
headers.update(custom_headers)
return headers
def _get_basic_auth(self) -> Optional[tuple]:
"""Get basic auth credentials if configured."""
if self.sources:
source = self.sources[0]
if hasattr(source, "auth_type") and source.auth_type == "basic":
username = getattr(source, "username", None)
if username:
password = (
source.get_password()
if hasattr(source, "get_password")
else getattr(source, "password", "")
)
return (username, password or "")
return None
async def get_available_versions(self, package_name: str) -> List[str]:
"""Get all available versions from PyPI JSON API."""
normalized = self._normalize_package_name(package_name)
url = f"{self.api_url}/{normalized}/json"
headers = self._get_auth_headers()
auth = self._get_basic_auth()
timeout = httpx.Timeout(self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT)
try:
response = await self.client.get(
url, headers=headers, auth=auth, timeout=timeout
)
if response.status_code == 404:
logger.debug(f"Package {normalized} not found on PyPI")
return []
if response.status_code != 200:
logger.warning(
f"PyPI API returned {response.status_code} for {normalized}"
)
return []
data = response.json()
releases = data.get("releases", {})
# Filter to valid versions and sort
versions = []
for v in releases.keys():
try:
Version(v)
versions.append(v)
except InvalidVersion:
continue
versions.sort(key=lambda x: Version(x))
return versions
except httpx.RequestError as e:
logger.warning(f"Failed to query PyPI for {normalized}: {e}")
return []
except Exception as e:
logger.warning(f"Error parsing PyPI response for {normalized}: {e}")
return []
async def resolve_constraint(
self, package_name: str, constraint: str
) -> Optional[VersionInfo]:
"""Find best version matching constraint from PyPI."""
normalized = self._normalize_package_name(package_name)
url = f"{self.api_url}/{normalized}/json"
headers = self._get_auth_headers()
auth = self._get_basic_auth()
timeout = httpx.Timeout(self.READ_TIMEOUT, connect=self.CONNECT_TIMEOUT)
try:
response = await self.client.get(
url, headers=headers, auth=auth, timeout=timeout
)
if response.status_code == 404:
logger.debug(f"Package {normalized} not found on PyPI")
return None
if response.status_code != 200:
logger.warning(
f"PyPI API returned {response.status_code} for {normalized}"
)
return None
data = response.json()
releases = data.get("releases", {})
# Handle wildcard - return latest version
if constraint == "*":
latest_version = data.get("info", {}).get("version")
if latest_version and latest_version in releases:
return self._get_version_info(
normalized, latest_version, releases[latest_version]
)
return None
# Parse constraint
# If constraint looks like a bare version (no operator), treat as exact match
# e.g., "2025.10.5" -> "==2025.10.5"
effective_constraint = constraint
if constraint and constraint[0].isdigit():
effective_constraint = f"=={constraint}"
logger.debug(
f"Bare version '{constraint}' for {normalized}, "
f"treating as exact match '{effective_constraint}'"
)
try:
specifier = SpecifierSet(effective_constraint)
except InvalidSpecifier:
# Invalid constraint - treat as wildcard
logger.warning(
f"Invalid version constraint '{constraint}' for {normalized}, "
"treating as wildcard"
)
latest_version = data.get("info", {}).get("version")
if latest_version and latest_version in releases:
return self._get_version_info(
normalized, latest_version, releases[latest_version]
)
return None
# Find matching versions
matching = []
for v_str, files in releases.items():
if not files: # Skip versions with no files
continue
try:
v = Version(v_str)
if v in specifier:
matching.append((v_str, v, files))
except InvalidVersion:
continue
if not matching:
logger.debug(
f"No versions of {normalized} match constraint '{constraint}'"
)
return None
# Sort by version and return highest match
matching.sort(key=lambda x: x[1], reverse=True)
best_version, _, best_files = matching[0]
return self._get_version_info(normalized, best_version, best_files)
except httpx.RequestError as e:
logger.warning(f"Failed to query PyPI for {normalized}: {e}")
return None
except Exception as e:
logger.warning(f"Error resolving {normalized}@{constraint}: {e}")
return None
def _get_version_info(
self, package_name: str, version: str, files: List[dict]
) -> Optional[VersionInfo]:
"""Extract download info from PyPI release files."""
if not files:
return None
# Prefer wheel over sdist
wheel_file = None
sdist_file = None
for f in files:
filename = f.get("filename", "")
if filename.endswith(".whl"):
# Prefer platform-agnostic wheels
if "py3-none-any" in filename or wheel_file is None:
wheel_file = f
elif filename.endswith(".tar.gz") and sdist_file is None:
sdist_file = f
selected = wheel_file or sdist_file
if not selected:
# Fall back to first available file
selected = files[0]
return VersionInfo(
version=version,
download_url=selected.get("url", ""),
filename=selected.get("filename", ""),
sha256=selected.get("digests", {}).get("sha256"),
size=selected.get("size"),
content_type="application/zip"
if selected.get("filename", "").endswith(".whl")
else "application/gzip",
)
async def fetch_package(
self,
package_name: str,
version_info: VersionInfo,
db: Session,
storage: "S3Storage",
) -> Optional[FetchResult]:
"""Fetch and cache a PyPI package."""
# Import here to avoid circular imports
from .pypi_proxy import fetch_and_cache_pypi_package
normalized = self._normalize_package_name(package_name)
logger.info(
f"Fetching {normalized}=={version_info.version} from upstream PyPI"
)
result = await fetch_and_cache_pypi_package(
db=db,
storage=storage,
http_client=self.client,
package_name=normalized,
filename=version_info.filename,
download_url=version_info.download_url,
expected_sha256=version_info.sha256,
)
if result is None:
return None
return FetchResult(
artifact_id=result["artifact_id"],
size=result["size"],
version=version_info.version,
filename=version_info.filename,
already_cached=result.get("already_cached", False),
)
def get_registry_client(
source_type: str,
http_client: httpx.AsyncClient,
upstream_sources: List,
) -> Optional[RegistryClient]:
"""
Factory function to get a registry client for a source type.
Args:
source_type: The registry type ('pypi', 'npm', etc.)
http_client: Shared async HTTP client
upstream_sources: List of configured upstream sources
Returns:
RegistryClient for the source type, or None if not supported
"""
if source_type == "pypi":
# Filter to PyPI sources
pypi_sources = [s for s in upstream_sources if getattr(s, "source_type", "") == "pypi"]
return PyPIRegistryClient(http_client, pypi_sources)
# Future: Add npm, maven, etc.
logger.debug(f"No registry client available for source type: {source_type}")
return None
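
A sketch of the intended auto-fetch flow (not part of this diff); the caller is assumed to supply the pooled HTTP client, DB session, S3 storage, and configured upstream sources:

# Hypothetical caller: resolve and cache one dependency from upstream PyPI.
async def auto_fetch_dependency(package, constraint, client, db, storage, sources):
    registry = get_registry_client("pypi", client, sources)
    if registry is None:
        return None  # source type not supported
    version_info = await registry.resolve_constraint(package, constraint)
    if version_info is None:
        return None  # nothing upstream satisfies the constraint
    # Downloads via pypi_proxy.fetch_and_cache_pypi_package and records the artifact.
    return await registry.fetch_package(package, version_info, db, storage)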

View File

@@ -9,7 +9,6 @@ from .base import BaseRepository
from .project import ProjectRepository
from .package import PackageRepository
from .artifact import ArtifactRepository
from .tag import TagRepository
from .upload import UploadRepository
__all__ = [
@@ -17,6 +16,5 @@ __all__ = [
"ProjectRepository",
"PackageRepository",
"ArtifactRepository",
"TagRepository",
"UploadRepository",
]

View File

@@ -8,7 +8,7 @@ from sqlalchemy import func, or_
from uuid import UUID
from .base import BaseRepository
from ..models import Artifact, Tag, Upload, Package, Project
from ..models import Artifact, PackageVersion, Upload, Package, Project
class ArtifactRepository(BaseRepository[Artifact]):
@@ -77,14 +77,14 @@ class ArtifactRepository(BaseRepository[Artifact]):
.all()
)
def get_artifacts_without_tags(self, limit: int = 100) -> List[Artifact]:
"""Get artifacts that have no tags pointing to them."""
# Subquery to find artifact IDs that have tags
tagged_artifacts = self.db.query(Tag.artifact_id).distinct().subquery()
def get_artifacts_without_versions(self, limit: int = 100) -> List[Artifact]:
"""Get artifacts that have no versions pointing to them."""
# Subquery to find artifact IDs that have versions
versioned_artifacts = self.db.query(PackageVersion.artifact_id).distinct().subquery()
return (
self.db.query(Artifact)
.filter(~Artifact.id.in_(tagged_artifacts))
.filter(~Artifact.id.in_(versioned_artifacts))
.limit(limit)
.all()
)
@@ -115,34 +115,34 @@ class ArtifactRepository(BaseRepository[Artifact]):
return artifacts, total
def get_referencing_tags(self, artifact_id: str) -> List[Tuple[Tag, Package, Project]]:
"""Get all tags referencing this artifact with package and project info."""
def get_referencing_versions(self, artifact_id: str) -> List[Tuple[PackageVersion, Package, Project]]:
"""Get all versions referencing this artifact with package and project info."""
return (
self.db.query(Tag, Package, Project)
.join(Package, Tag.package_id == Package.id)
self.db.query(PackageVersion, Package, Project)
.join(Package, PackageVersion.package_id == Package.id)
.join(Project, Package.project_id == Project.id)
.filter(Tag.artifact_id == artifact_id)
.filter(PackageVersion.artifact_id == artifact_id)
.all()
)
def search(self, query_str: str, limit: int = 10) -> List[Tuple[Tag, Artifact, str, str]]:
def search(self, query_str: str, limit: int = 10) -> List[Tuple[PackageVersion, Artifact, str, str]]:
"""
Search artifacts by tag name or original filename.
Returns (tag, artifact, package_name, project_name) tuples.
Search artifacts by version or original filename.
Returns (version, artifact, package_name, project_name) tuples.
"""
search_lower = query_str.lower()
return (
self.db.query(Tag, Artifact, Package.name, Project.name)
.join(Artifact, Tag.artifact_id == Artifact.id)
.join(Package, Tag.package_id == Package.id)
self.db.query(PackageVersion, Artifact, Package.name, Project.name)
.join(Artifact, PackageVersion.artifact_id == Artifact.id)
.join(Package, PackageVersion.package_id == Package.id)
.join(Project, Package.project_id == Project.id)
.filter(
or_(
func.lower(Tag.name).contains(search_lower),
func.lower(PackageVersion.version).contains(search_lower),
func.lower(Artifact.original_name).contains(search_lower)
)
)
.order_by(Tag.name)
.order_by(PackageVersion.version)
.limit(limit)
.all()
)

View File

@@ -8,7 +8,7 @@ from sqlalchemy import func, or_, asc, desc
from uuid import UUID
from .base import BaseRepository
from ..models import Package, Project, Tag, Upload, Artifact
from ..models import Package, Project, PackageVersion, Upload, Artifact
class PackageRepository(BaseRepository[Package]):
@@ -136,10 +136,10 @@ class PackageRepository(BaseRepository[Package]):
return self.update(package, **updates)
def get_stats(self, package_id: UUID) -> dict:
"""Get package statistics (tag count, artifact count, total size)."""
tag_count = (
self.db.query(func.count(Tag.id))
.filter(Tag.package_id == package_id)
"""Get package statistics (version count, artifact count, total size)."""
version_count = (
self.db.query(func.count(PackageVersion.id))
.filter(PackageVersion.package_id == package_id)
.scalar() or 0
)
@@ -154,7 +154,7 @@ class PackageRepository(BaseRepository[Package]):
)
return {
"tag_count": tag_count,
"version_count": version_count,
"artifact_count": artifact_stats[0] if artifact_stats else 0,
"total_size": artifact_stats[1] if artifact_stats else 0,
}

View File

@@ -1,168 +0,0 @@
"""
Tag repository for data access operations.
"""
from typing import Optional, List, Tuple
from sqlalchemy.orm import Session
from sqlalchemy import func, or_, asc, desc
from uuid import UUID
from .base import BaseRepository
from ..models import Tag, TagHistory, Artifact, Package, Project
class TagRepository(BaseRepository[Tag]):
"""Repository for Tag entity operations."""
model = Tag
def get_by_name(self, package_id: UUID, name: str) -> Optional[Tag]:
"""Get tag by name within a package."""
return (
self.db.query(Tag)
.filter(Tag.package_id == package_id, Tag.name == name)
.first()
)
def get_with_artifact(self, package_id: UUID, name: str) -> Optional[Tuple[Tag, Artifact]]:
"""Get tag with its artifact."""
return (
self.db.query(Tag, Artifact)
.join(Artifact, Tag.artifact_id == Artifact.id)
.filter(Tag.package_id == package_id, Tag.name == name)
.first()
)
def exists_by_name(self, package_id: UUID, name: str) -> bool:
"""Check if tag with name exists in package."""
return self.db.query(
self.db.query(Tag)
.filter(Tag.package_id == package_id, Tag.name == name)
.exists()
).scalar()
def list_by_package(
self,
package_id: UUID,
page: int = 1,
limit: int = 20,
search: Optional[str] = None,
sort: str = "name",
order: str = "asc",
) -> Tuple[List[Tuple[Tag, Artifact]], int]:
"""
List tags in a package with artifact metadata.
Returns tuple of ((tag, artifact) tuples, total_count).
"""
query = (
self.db.query(Tag, Artifact)
.join(Artifact, Tag.artifact_id == Artifact.id)
.filter(Tag.package_id == package_id)
)
# Apply search filter (tag name or artifact original filename)
if search:
search_lower = search.lower()
query = query.filter(
or_(
func.lower(Tag.name).contains(search_lower),
func.lower(Artifact.original_name).contains(search_lower)
)
)
# Get total count
total = query.count()
# Apply sorting
sort_columns = {
"name": Tag.name,
"created_at": Tag.created_at,
}
sort_column = sort_columns.get(sort, Tag.name)
if order == "desc":
query = query.order_by(desc(sort_column))
else:
query = query.order_by(asc(sort_column))
# Apply pagination
offset = (page - 1) * limit
results = query.offset(offset).limit(limit).all()
return results, total
def create_tag(
self,
package_id: UUID,
name: str,
artifact_id: str,
created_by: str,
) -> Tag:
"""Create a new tag."""
return self.create(
package_id=package_id,
name=name,
artifact_id=artifact_id,
created_by=created_by,
)
def update_artifact(
self,
tag: Tag,
new_artifact_id: str,
changed_by: str,
record_history: bool = True,
) -> Tag:
"""
Update tag to point to a different artifact.
Optionally records change in tag history.
"""
old_artifact_id = tag.artifact_id
if record_history and old_artifact_id != new_artifact_id:
history = TagHistory(
tag_id=tag.id,
old_artifact_id=old_artifact_id,
new_artifact_id=new_artifact_id,
changed_by=changed_by,
)
self.db.add(history)
tag.artifact_id = new_artifact_id
tag.created_by = changed_by
self.db.flush()
return tag
def get_history(self, tag_id: UUID) -> List[TagHistory]:
"""Get tag change history."""
return (
self.db.query(TagHistory)
.filter(TagHistory.tag_id == tag_id)
.order_by(TagHistory.changed_at.desc())
.all()
)
def get_latest_in_package(self, package_id: UUID) -> Optional[Tag]:
"""Get the most recently created/updated tag in a package."""
return (
self.db.query(Tag)
.filter(Tag.package_id == package_id)
.order_by(Tag.created_at.desc())
.first()
)
def get_by_artifact(self, artifact_id: str) -> List[Tag]:
"""Get all tags pointing to an artifact."""
return (
self.db.query(Tag)
.filter(Tag.artifact_id == artifact_id)
.all()
)
def count_by_artifact(self, artifact_id: str) -> int:
"""Count tags pointing to an artifact."""
return (
self.db.query(func.count(Tag.id))
.filter(Tag.artifact_id == artifact_id)
.scalar() or 0
)

File diff suppressed because it is too large

View File

@@ -33,6 +33,7 @@ class ProjectResponse(BaseModel):
name: str
description: Optional[str]
is_public: bool
is_system: bool = False
created_at: datetime
updated_at: datetime
created_by: str
@@ -113,14 +114,6 @@ class PackageUpdate(BaseModel):
platform: Optional[str] = None
class TagSummary(BaseModel):
"""Lightweight tag info for embedding in package responses"""
name: str
artifact_id: str
created_at: datetime
class PackageDetailResponse(BaseModel):
"""Package with aggregated metadata"""
@@ -133,13 +126,9 @@ class PackageDetailResponse(BaseModel):
created_at: datetime
updated_at: datetime
# Aggregated fields
tag_count: int = 0
artifact_count: int = 0
total_size: int = 0
latest_tag: Optional[str] = None
latest_upload_at: Optional[datetime] = None
# Recent tags (limit 5)
recent_tags: List[TagSummary] = []
class Config:
from_attributes = True
@@ -164,79 +153,6 @@ class ArtifactResponse(BaseModel):
from_attributes = True
# Tag schemas
class TagCreate(BaseModel):
name: str
artifact_id: str
class TagResponse(BaseModel):
id: UUID
package_id: UUID
name: str
artifact_id: str
created_at: datetime
created_by: str
version: Optional[str] = None # Version of the artifact this tag points to
class Config:
from_attributes = True
class TagDetailResponse(BaseModel):
"""Tag with embedded artifact metadata"""
id: UUID
package_id: UUID
name: str
artifact_id: str
created_at: datetime
created_by: str
version: Optional[str] = None # Version of the artifact this tag points to
# Artifact metadata
artifact_size: int
artifact_content_type: Optional[str]
artifact_original_name: Optional[str]
artifact_created_at: datetime
artifact_format_metadata: Optional[Dict[str, Any]] = None
class Config:
from_attributes = True
class TagHistoryResponse(BaseModel):
"""History entry for tag changes"""
id: UUID
tag_id: UUID
old_artifact_id: Optional[str]
new_artifact_id: str
changed_at: datetime
changed_by: str
class Config:
from_attributes = True
class TagHistoryDetailResponse(BaseModel):
"""Tag history with artifact metadata for each version"""
id: UUID
tag_id: UUID
tag_name: str
old_artifact_id: Optional[str]
new_artifact_id: str
changed_at: datetime
changed_by: str
# Artifact metadata for new artifact
artifact_size: int
artifact_original_name: Optional[str]
artifact_content_type: Optional[str]
class Config:
from_attributes = True
# Audit log schemas
class AuditLogResponse(BaseModel):
"""Audit log entry response"""
@@ -263,7 +179,7 @@ class UploadHistoryResponse(BaseModel):
package_name: str
project_name: str
original_name: Optional[str]
tag_name: Optional[str]
version: Optional[str]
uploaded_at: datetime
uploaded_by: str
source_ip: Optional[str]
@@ -294,10 +210,10 @@ class ArtifactProvenanceResponse(BaseModel):
# Usage statistics
upload_count: int
# References
packages: List[Dict[str, Any]] # List of {project_name, package_name, tag_names}
tags: List[
packages: List[Dict[str, Any]] # List of {project_name, package_name, versions}
versions: List[
Dict[str, Any]
] # List of {project_name, package_name, tag_name, created_at}
] # List of {project_name, package_name, version, created_at}
# Upload history
uploads: List[Dict[str, Any]] # List of upload events
@@ -305,18 +221,8 @@ class ArtifactProvenanceResponse(BaseModel):
from_attributes = True
class ArtifactTagInfo(BaseModel):
"""Tag info for embedding in artifact responses"""
id: UUID
name: str
package_id: UUID
package_name: str
project_name: str
class ArtifactDetailResponse(BaseModel):
"""Artifact with list of tags/packages referencing it"""
"""Artifact with metadata"""
id: str
sha256: str # Explicit SHA256 field (same as id)
@@ -330,14 +236,14 @@ class ArtifactDetailResponse(BaseModel):
created_by: str
ref_count: int
format_metadata: Optional[Dict[str, Any]] = None
tags: List[ArtifactTagInfo] = []
versions: List[Dict[str, Any]] = [] # List of {version, package_name, project_name}
class Config:
from_attributes = True
class PackageArtifactResponse(BaseModel):
"""Artifact with tags for package artifact listing"""
"""Artifact for package artifact listing"""
id: str
sha256: str # Explicit SHA256 field (same as id)
@@ -350,7 +256,7 @@ class PackageArtifactResponse(BaseModel):
created_at: datetime
created_by: str
format_metadata: Optional[Dict[str, Any]] = None
tags: List[str] = [] # Tag names pointing to this artifact
version: Optional[str] = None # Version from PackageVersion if exists
class Config:
from_attributes = True
@@ -368,28 +274,9 @@ class GlobalArtifactResponse(BaseModel):
created_by: str
format_metadata: Optional[Dict[str, Any]] = None
ref_count: int = 0
# Context from tags/packages
# Context from versions/packages
projects: List[str] = [] # List of project names containing this artifact
packages: List[str] = [] # List of "project/package" paths
tags: List[str] = [] # List of "project/package:tag" references
class Config:
from_attributes = True
class GlobalTagResponse(BaseModel):
"""Tag with project/package context for global listing"""
id: UUID
name: str
artifact_id: str
created_at: datetime
created_by: str
project_name: str
package_name: str
artifact_size: Optional[int] = None
artifact_content_type: Optional[str] = None
version: Optional[str] = None # Version of the artifact this tag points to
class Config:
from_attributes = True
@@ -402,7 +289,6 @@ class UploadResponse(BaseModel):
size: int
project: str
package: str
tag: Optional[str]
version: Optional[str] = None # Version assigned to this artifact
version_source: Optional[str] = None # How version was determined: 'explicit', 'filename', 'metadata'
checksum_md5: Optional[str] = None
@@ -429,7 +315,6 @@ class ResumableUploadInitRequest(BaseModel):
filename: str
content_type: Optional[str] = None
size: int
tag: Optional[str] = None
version: Optional[str] = None # Explicit version (auto-detected if not provided)
@field_validator("expected_hash")
@@ -464,7 +349,7 @@ class ResumableUploadPartResponse(BaseModel):
class ResumableUploadCompleteRequest(BaseModel):
"""Request to complete a resumable upload"""
tag: Optional[str] = None
pass
class ResumableUploadCompleteResponse(BaseModel):
@@ -474,7 +359,6 @@ class ResumableUploadCompleteResponse(BaseModel):
size: int
project: str
package: str
tag: Optional[str]
class ResumableUploadStatusResponse(BaseModel):
@@ -527,7 +411,6 @@ class PackageVersionResponse(BaseModel):
size: Optional[int] = None
content_type: Optional[str] = None
original_name: Optional[str] = None
tags: List[str] = [] # Tag names pointing to this artifact
class Config:
from_attributes = True
@@ -569,11 +452,10 @@ class SearchResultPackage(BaseModel):
class SearchResultArtifact(BaseModel):
"""Artifact/tag result for global search"""
"""Artifact result for global search"""
tag_id: UUID
tag_name: str
artifact_id: str
version: Optional[str]
package_id: UUID
package_name: str
project_name: str
@@ -611,6 +493,8 @@ class HealthResponse(BaseModel):
version: str = "1.0.0"
storage_healthy: Optional[bool] = None
database_healthy: Optional[bool] = None
http_pool: Optional[Dict[str, Any]] = None
cache: Optional[Dict[str, Any]] = None
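For orientation, a health payload carrying the two new optional fields might look like the sketch below; the nested dictionary shapes are assumptions, not defined by the schema.

# Illustrative health payload; the nested dict shapes are assumptions.
health = {
    "version": "1.0.0",
    "storage_healthy": True,
    "database_healthy": True,
    "http_pool": {"active": 2, "idle": 8},
    "cache": {"enabled": True, "hits": 1532, "misses": 87},
}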
# Garbage collection schemas
@@ -686,7 +570,7 @@ class ProjectStatsResponse(BaseModel):
project_id: str
project_name: str
package_count: int
tag_count: int
version_count: int
artifact_count: int
total_size_bytes: int
upload_count: int
@@ -701,7 +585,7 @@ class PackageStatsResponse(BaseModel):
package_id: str
package_name: str
project_name: str
tag_count: int
version_count: int
artifact_count: int
total_size_bytes: int
upload_count: int
@@ -718,9 +602,9 @@ class ArtifactStatsResponse(BaseModel):
size: int
ref_count: int
storage_savings: int # (ref_count - 1) * size
tags: List[Dict[str, Any]] # Tags referencing this artifact
projects: List[str] # Projects using this artifact
packages: List[str] # Packages using this artifact
versions: List[Dict[str, Any]] = [] # List of {version, package_name, project_name}
first_uploaded: Optional[datetime] = None
last_referenced: Optional[datetime] = None
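For example, a 100 MB artifact with ref_count 3 reports storage_savings of 200 MB: (3 - 1) * 100 MB, i.e. two duplicate copies that deduplication avoided storing.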
@@ -929,20 +813,7 @@ class DependencyCreate(BaseModel):
"""Schema for creating a dependency"""
project: str
package: str
version: Optional[str] = None
tag: Optional[str] = None
@field_validator('version', 'tag')
@classmethod
def validate_constraint(cls, v, info):
return v
def model_post_init(self, __context):
"""Validate that exactly one of version or tag is set"""
if self.version is None and self.tag is None:
raise ValueError("Either 'version' or 'tag' must be specified")
if self.version is not None and self.tag is not None:
raise ValueError("Cannot specify both 'version' and 'tag'")
version: str
class DependencyResponse(BaseModel):
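A minimal sketch of the simplified dependency payload (field names are taken from the schema above; the usage itself is illustrative):

# Illustrative usage of the version-only schema; the old tag constraint is gone.
dep = DependencyCreate(
    project="frontend-libs",
    package="design-tokens",
    version="1.0.0",
)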
@@ -951,8 +822,7 @@ class DependencyResponse(BaseModel):
artifact_id: str
project: str
package: str
version: Optional[str] = None
tag: Optional[str] = None
version: str
created_at: datetime
class Config:
@@ -967,7 +837,6 @@ class DependencyResponse(BaseModel):
project=dep.dependency_project,
package=dep.dependency_package,
version=dep.version_constraint,
tag=dep.tag_constraint,
created_at=dep.created_at,
)
@@ -984,7 +853,6 @@ class DependentInfo(BaseModel):
project: str
package: str
version: Optional[str] = None
constraint_type: str # 'version' or 'tag'
constraint_value: str
@@ -1000,20 +868,7 @@ class EnsureFileDependency(BaseModel):
"""Dependency entry from orchard.ensure file"""
project: str
package: str
version: Optional[str] = None
tag: Optional[str] = None
@field_validator('version', 'tag')
@classmethod
def validate_constraint(cls, v, info):
return v
def model_post_init(self, __context):
"""Validate that exactly one of version or tag is set"""
if self.version is None and self.tag is None:
raise ValueError("Either 'version' or 'tag' must be specified")
if self.version is not None and self.tag is not None:
raise ValueError("Cannot specify both 'version' and 'tag'")
version: str
class EnsureFileContent(BaseModel):
@@ -1027,15 +882,26 @@ class ResolvedArtifact(BaseModel):
project: str
package: str
version: Optional[str] = None
tag: Optional[str] = None
size: int
download_url: str
class MissingDependency(BaseModel):
"""A dependency that could not be resolved (not cached on server)"""
project: str
package: str
constraint: Optional[str] = None
required_by: Optional[str] = None
fetch_attempted: bool = False # True if auto-fetch was attempted
fetch_error: Optional[str] = None # Error message if fetch failed
class DependencyResolutionResponse(BaseModel):
"""Response from dependency resolution endpoint"""
requested: Dict[str, str] # project, package, ref
resolved: List[ResolvedArtifact]
missing: List[MissingDependency] = []
fetched: List[ResolvedArtifact] = [] # Artifacts fetched from upstream during resolution
total_size: int
artifact_count: int
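A hedged sketch of how a client might read this response; the field names come from the schemas above, everything else is illustrative:

# Illustrative consumer of a resolution response.
def summarize(resp: DependencyResolutionResponse) -> str:
    fetched = [f"{a.project}/{a.package}" for a in resp.fetched]
    missing = [f"{m.project}/{m.package}" for m in resp.missing]
    return (
        f"{resp.artifact_count} artifacts, {resp.total_size} bytes; "
        f"fetched from upstream: {fetched or 'none'}; "
        f"missing: {missing or 'none'}"
    )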
@@ -1044,7 +910,7 @@ class DependencyConflict(BaseModel):
"""Details about a dependency conflict"""
project: str
package: str
requirements: List[Dict[str, Any]] # version/tag and required_by info
requirements: List[Dict[str, Any]] # version and required_by info
class DependencyConflictError(BaseModel):
@@ -1378,10 +1244,10 @@ class CacheRequest(BaseModel):
url: str
source_type: str
package_name: Optional[str] = None # Auto-derived from URL if not provided
tag: Optional[str] = None # Auto-derived from URL if not provided
version: Optional[str] = None # Auto-derived from URL if not provided
user_project: Optional[str] = None # Cross-reference to user project
user_package: Optional[str] = None
user_tag: Optional[str] = None
user_version: Optional[str] = None
expected_hash: Optional[str] = None # Verify downloaded content
@field_validator('url')
@@ -1428,8 +1294,8 @@ class CacheResponse(BaseModel):
source_name: Optional[str]
system_project: str
system_package: str
system_tag: Optional[str]
user_reference: Optional[str] = None # e.g., "my-app/npm-deps:lodash-4.17.21"
system_version: Optional[str]
user_reference: Optional[str] = None # e.g., "my-app/npm-deps/+/4.17.21"
class CacheResolveRequest(BaseModel):
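The user_reference comment above switches from the old "project/package:tag" form to a version path. A small, hypothetical helper illustrating the new format (not part of the codebase):

# Hypothetical helper; format taken from the comment above.
def build_user_reference(project: str, package: str, version: str) -> str:
    return f"{project}/{package}/+/{version}"

assert build_user_reference("my-app", "npm-deps", "4.17.21") == "my-app/npm-deps/+/4.17.21"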
@@ -1443,7 +1309,7 @@ class CacheResolveRequest(BaseModel):
version: str
user_project: Optional[str] = None
user_package: Optional[str] = None
user_tag: Optional[str] = None
user_version: Optional[str] = None
@field_validator('source_type')
@classmethod

View File

@@ -5,7 +5,7 @@ import hashlib
import logging
from sqlalchemy.orm import Session
from .models import Project, Package, Artifact, Tag, Upload, PackageVersion, ArtifactDependency, Team, TeamMembership, User
from .models import Project, Package, Artifact, Upload, PackageVersion, ArtifactDependency, Team, TeamMembership, User
from .storage import get_storage
from .auth import hash_password
@@ -125,14 +125,14 @@ TEST_ARTIFACTS = [
]
# Dependencies to create (source artifact -> dependency)
# Format: (source_project, source_package, source_version, dep_project, dep_package, version_constraint, tag_constraint)
# Format: (source_project, source_package, source_version, dep_project, dep_package, version_constraint)
TEST_DEPENDENCIES = [
# ui-components v1.1.0 depends on design-tokens v1.0.0
("frontend-libs", "ui-components", "1.1.0", "frontend-libs", "design-tokens", "1.0.0", None),
("frontend-libs", "ui-components", "1.1.0", "frontend-libs", "design-tokens", "1.0.0"),
# auth-lib v1.0.0 depends on common-utils v2.0.0
("backend-services", "auth-lib", "1.0.0", "backend-services", "common-utils", "2.0.0", None),
# auth-lib v1.0.0 also depends on design-tokens (stable tag)
("backend-services", "auth-lib", "1.0.0", "frontend-libs", "design-tokens", None, "latest"),
("backend-services", "auth-lib", "1.0.0", "backend-services", "common-utils", "2.0.0"),
# auth-lib v1.0.0 also depends on design-tokens v1.0.0
("backend-services", "auth-lib", "1.0.0", "frontend-libs", "design-tokens", "1.0.0"),
]
@@ -252,9 +252,8 @@ def seed_database(db: Session) -> None:
logger.info(f"Created {len(project_map)} projects and {len(package_map)} packages (assigned to {demo_team.slug})")
# Create artifacts, tags, and versions
# Create artifacts and versions
artifact_count = 0
tag_count = 0
version_count = 0
for artifact_data in TEST_ARTIFACTS:
@@ -316,23 +315,12 @@ def seed_database(db: Session) -> None:
db.add(version)
version_count += 1
# Create tags
for tag_name in artifact_data["tags"]:
tag = Tag(
package_id=package.id,
name=tag_name,
artifact_id=sha256_hash,
created_by=team_owner_username,
)
db.add(tag)
tag_count += 1
db.flush()
# Create dependencies
dependency_count = 0
for dep_data in TEST_DEPENDENCIES:
src_project, src_package, src_version, dep_project, dep_package, version_constraint, tag_constraint = dep_data
src_project, src_package, src_version, dep_project, dep_package, version_constraint = dep_data
# Find the source artifact by looking up its version
src_pkg = package_map.get((src_project, src_package))
@@ -356,11 +344,10 @@ def seed_database(db: Session) -> None:
dependency_project=dep_project,
dependency_package=dep_package,
version_constraint=version_constraint,
tag_constraint=tag_constraint,
)
db.add(dependency)
dependency_count += 1
db.commit()
logger.info(f"Created {artifact_count} artifacts, {tag_count} tags, {version_count} versions, and {dependency_count} dependencies")
logger.info(f"Created {artifact_count} artifacts, {version_count} versions, and {dependency_count} dependencies")
logger.info("Database seeding complete")

View File

@@ -6,9 +6,8 @@ from typing import List, Optional, Tuple
from sqlalchemy.orm import Session
import logging
from ..models import Artifact, Tag
from ..models import Artifact, PackageVersion
from ..repositories.artifact import ArtifactRepository
from ..repositories.tag import TagRepository
from ..storage import S3Storage
logger = logging.getLogger(__name__)
@@ -21,8 +20,8 @@ class ArtifactCleanupService:
Reference counting rules:
- ref_count starts at 1 when artifact is first uploaded
- ref_count increments when the same artifact is uploaded again (deduplication)
- ref_count decrements when a tag is deleted or updated to point elsewhere
- ref_count decrements when a package is deleted (for each tag pointing to artifact)
- ref_count decrements when a version is deleted or updated to point elsewhere
- ref_count decrements when a package is deleted (for each version pointing to artifact)
- When ref_count reaches 0, artifact is a candidate for deletion from S3
"""
@@ -30,12 +29,11 @@ class ArtifactCleanupService:
self.db = db
self.storage = storage
self.artifact_repo = ArtifactRepository(db)
self.tag_repo = TagRepository(db)
def on_tag_deleted(self, artifact_id: str) -> Artifact:
def on_version_deleted(self, artifact_id: str) -> Artifact:
"""
Called when a tag is deleted.
Decrements ref_count for the artifact the tag was pointing to.
Called when a version is deleted.
Decrements ref_count for the artifact the version was pointing to.
"""
artifact = self.artifact_repo.get_by_sha256(artifact_id)
if artifact:
@@ -45,11 +43,11 @@ class ArtifactCleanupService:
)
return artifact
def on_tag_updated(
def on_version_updated(
self, old_artifact_id: str, new_artifact_id: str
) -> Tuple[Optional[Artifact], Optional[Artifact]]:
"""
Called when a tag is updated to point to a different artifact.
Called when a version is updated to point to a different artifact.
Decrements ref_count for old artifact, increments for new (if different).
Returns (old_artifact, new_artifact) tuple.
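The method body is not shown in this hunk; a hedged sketch of the swap the docstring describes (increment_ref_count is assumed to exist alongside decrement_ref_count):

# Hypothetical sketch inferred from the docstring; not the actual implementation.
old_artifact = self.artifact_repo.get_by_sha256(old_artifact_id)
new_artifact = self.artifact_repo.get_by_sha256(new_artifact_id)
if old_artifact_id != new_artifact_id:
    if old_artifact:
        self.artifact_repo.decrement_ref_count(old_artifact)
    if new_artifact:
        self.artifact_repo.increment_ref_count(new_artifact)  # assumed repository method
return old_artifact, new_artifact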
@@ -79,21 +77,21 @@ class ArtifactCleanupService:
def on_package_deleted(self, package_id) -> List[str]:
"""
Called when a package is deleted.
Decrements ref_count for all artifacts that had tags in the package.
Decrements ref_count for all artifacts that had versions in the package.
Returns list of artifact IDs that were affected.
"""
# Get all tags in the package before deletion
tags = self.db.query(Tag).filter(Tag.package_id == package_id).all()
# Get all versions in the package before deletion
versions = self.db.query(PackageVersion).filter(PackageVersion.package_id == package_id).all()
affected_artifacts = []
for tag in tags:
artifact = self.artifact_repo.get_by_sha256(tag.artifact_id)
for version in versions:
artifact = self.artifact_repo.get_by_sha256(version.artifact_id)
if artifact:
self.artifact_repo.decrement_ref_count(artifact)
affected_artifacts.append(tag.artifact_id)
affected_artifacts.append(version.artifact_id)
logger.info(
f"Decremented ref_count for artifact {tag.artifact_id} (package delete)"
f"Decremented ref_count for artifact {version.artifact_id} (package delete)"
)
return affected_artifacts
@@ -152,7 +150,7 @@ class ArtifactCleanupService:
def verify_ref_counts(self, fix: bool = False) -> List[dict]:
"""
Verify that ref_counts match actual tag references.
Verify that ref_counts match actual version references.
Args:
fix: If True, fix any mismatched ref_counts
@@ -162,28 +160,28 @@ class ArtifactCleanupService:
"""
from sqlalchemy import func
# Get actual tag counts per artifact
tag_counts = (
self.db.query(Tag.artifact_id, func.count(Tag.id).label("tag_count"))
.group_by(Tag.artifact_id)
# Get actual version counts per artifact
version_counts = (
self.db.query(PackageVersion.artifact_id, func.count(PackageVersion.id).label("version_count"))
.group_by(PackageVersion.artifact_id)
.all()
)
tag_count_map = {artifact_id: count for artifact_id, count in tag_counts}
version_count_map = {artifact_id: count for artifact_id, count in version_counts}
# Check all artifacts
artifacts = self.db.query(Artifact).all()
mismatches = []
for artifact in artifacts:
actual_count = tag_count_map.get(artifact.id, 0)
actual_count = version_count_map.get(artifact.id, 0)
# ref_count should be at least 1 (initial upload) + additional uploads
# But tags are the primary reference, so we check against tag count
# But versions are the primary reference, so we check against version count
if artifact.ref_count < actual_count:
mismatch = {
"artifact_id": artifact.id,
"stored_ref_count": artifact.ref_count,
"actual_tag_count": actual_count,
"actual_version_count": actual_count,
}
mismatches.append(mismatch)