From c79b10cbc5a23b047a104f5c1fbf553d1a40a144 Mon Sep 17 00:00:00 2001 From: Mondo Diaz Date: Mon, 5 Jan 2026 14:57:47 -0600 Subject: [PATCH] Add comprehensive stats endpoints and reporting features Backend stats endpoints: - GET /api/v1/project/:project/packages/:package/stats - per-package stats - GET /api/v1/artifact/:id/stats - artifact reference statistics - GET /api/v1/stats/cross-project - cross-project deduplication detection - GET /api/v1/stats/timeline - time-based metrics (daily/weekly/monthly) - GET /api/v1/stats/export - CSV/JSON export - GET /api/v1/stats/report - markdown/JSON summary report generation Enhanced existing endpoints: - Added storage_saved_bytes and deduplication_ratio to project stats - Added date range filtering via from_date/to_date params New schemas: - PackageStatsResponse - ArtifactStatsResponse - CrossProjectDeduplicationResponse - TimeBasedStatsResponse - StatsReportResponse --- backend/app/routes.py | 514 ++++++++++++++++++++++++++++++++++++++++- backend/app/schemas.py | 59 +++++ 2 files changed, 572 insertions(+), 1 deletion(-) diff --git a/backend/app/routes.py b/backend/app/routes.py index aea8ae4..d04e395 100644 --- a/backend/app/routes.py +++ b/backend/app/routes.py @@ -1,3 +1,4 @@ +import json from datetime import datetime, timedelta, timezone from fastapi import ( APIRouter, @@ -80,6 +81,11 @@ from .schemas import ( StorageStatsResponse, DeduplicationStatsResponse, ProjectStatsResponse, + PackageStatsResponse, + ArtifactStatsResponse, + CrossProjectDeduplicationResponse, + TimeBasedStatsResponse, + StatsReportResponse, ) from .metadata import extract_metadata from .config import get_settings @@ -2499,17 +2505,25 @@ def get_project_stats( artifact_count = artifact_stats[0] if artifact_stats else 0 total_size_bytes = artifact_stats[1] if artifact_stats else 0 - # Upload counts + # Upload counts and storage saved upload_stats = ( db.query( func.count(Upload.id), func.count(Upload.id).filter(Upload.deduplicated == True), + func.coalesce( + func.sum(Artifact.size).filter(Upload.deduplicated == True), 0 + ), ) + .join(Artifact, Upload.artifact_id == Artifact.id) .filter(Upload.package_id.in_(package_ids)) .first() ) upload_count = upload_stats[0] if upload_stats else 0 deduplicated_uploads = upload_stats[1] if upload_stats else 0 + storage_saved_bytes = upload_stats[2] if upload_stats else 0 + + # Calculate deduplication ratio + deduplication_ratio = upload_count / artifact_count if artifact_count > 0 else 1.0 return ProjectStatsResponse( project_id=str(project.id), @@ -2520,4 +2534,502 @@ def get_project_stats( total_size_bytes=total_size_bytes, upload_count=upload_count, deduplicated_uploads=deduplicated_uploads, + storage_saved_bytes=storage_saved_bytes, + deduplication_ratio=deduplication_ratio, + ) + + +# ============================================================================= +# Package Statistics Endpoint +# ============================================================================= + + +@router.get( + "/api/v1/project/{project_name}/packages/{package_name}/stats", + response_model=PackageStatsResponse, +) +def get_package_stats( + project_name: str, + package_name: str, + db: Session = Depends(get_db), +): + """Get statistics for a specific package.""" + project = db.query(Project).filter(Project.name == project_name).first() + if not project: + raise HTTPException(status_code=404, detail="Project not found") + + package = ( + db.query(Package) + .filter(Package.project_id == project.id, Package.name == package_name) + .first() + ) + if 
not package: + raise HTTPException(status_code=404, detail="Package not found") + + # Tag count + tag_count = ( + db.query(func.count(Tag.id)).filter(Tag.package_id == package.id).scalar() or 0 + ) + + # Artifact stats via uploads + artifact_stats = ( + db.query( + func.count(func.distinct(Upload.artifact_id)), + func.coalesce(func.sum(Artifact.size), 0), + ) + .join(Artifact, Upload.artifact_id == Artifact.id) + .filter(Upload.package_id == package.id) + .first() + ) + artifact_count = artifact_stats[0] if artifact_stats else 0 + total_size_bytes = artifact_stats[1] if artifact_stats else 0 + + # Upload stats + upload_stats = ( + db.query( + func.count(Upload.id), + func.count(Upload.id).filter(Upload.deduplicated == True), + func.coalesce( + func.sum(Artifact.size).filter(Upload.deduplicated == True), 0 + ), + ) + .join(Artifact, Upload.artifact_id == Artifact.id) + .filter(Upload.package_id == package.id) + .first() + ) + upload_count = upload_stats[0] if upload_stats else 0 + deduplicated_uploads = upload_stats[1] if upload_stats else 0 + storage_saved_bytes = upload_stats[2] if upload_stats else 0 + + deduplication_ratio = upload_count / artifact_count if artifact_count > 0 else 1.0 + + return PackageStatsResponse( + package_id=str(package.id), + package_name=package.name, + project_name=project.name, + tag_count=tag_count, + artifact_count=artifact_count, + total_size_bytes=total_size_bytes, + upload_count=upload_count, + deduplicated_uploads=deduplicated_uploads, + storage_saved_bytes=storage_saved_bytes, + deduplication_ratio=deduplication_ratio, + ) + + +# ============================================================================= +# Artifact Statistics Endpoint +# ============================================================================= + + +@router.get( + "/api/v1/artifact/{artifact_id}/stats", response_model=ArtifactStatsResponse +) +def get_artifact_stats( + artifact_id: str, + db: Session = Depends(get_db), +): + """Get detailed statistics for a specific artifact.""" + artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first() + if not artifact: + raise HTTPException(status_code=404, detail="Artifact not found") + + # Get all tags referencing this artifact + tags = ( + db.query(Tag, Package, Project) + .join(Package, Tag.package_id == Package.id) + .join(Project, Package.project_id == Project.id) + .filter(Tag.artifact_id == artifact_id) + .all() + ) + + tag_list = [ + { + "tag_name": tag.name, + "package_name": pkg.name, + "project_name": proj.name, + "created_at": tag.created_at.isoformat() if tag.created_at else None, + } + for tag, pkg, proj in tags + ] + + # Get unique projects and packages + projects = list(set(proj.name for _, _, proj in tags)) + packages = list(set(f"{proj.name}/{pkg.name}" for _, pkg, proj in tags)) + + # Get first and last upload times + upload_times = ( + db.query(func.min(Upload.uploaded_at), func.max(Upload.uploaded_at)) + .filter(Upload.artifact_id == artifact_id) + .first() + ) + + return ArtifactStatsResponse( + artifact_id=artifact.id, + sha256=artifact.id, + size=artifact.size, + ref_count=artifact.ref_count, + storage_savings=(artifact.ref_count - 1) * artifact.size + if artifact.ref_count > 1 + else 0, + tags=tag_list, + projects=projects, + packages=packages, + first_uploaded=upload_times[0] if upload_times else None, + last_referenced=upload_times[1] if upload_times else None, + ) + + +# ============================================================================= +# Cross-Project Deduplication Endpoint +# 
============================================================================= + + +@router.get( + "/api/v1/stats/cross-project", response_model=CrossProjectDeduplicationResponse +) +def get_cross_project_deduplication( + limit: int = Query(default=20, ge=1, le=100), + db: Session = Depends(get_db), +): + """Get statistics about artifacts shared across multiple projects.""" + # Find artifacts that appear in multiple projects + # Subquery to count distinct projects per artifact + project_counts = ( + db.query( + Upload.artifact_id, + func.count(func.distinct(Package.project_id)).label("project_count"), + ) + .join(Package, Upload.package_id == Package.id) + .group_by(Upload.artifact_id) + .subquery() + ) + + # Get artifacts with more than one project + shared_artifacts_query = ( + db.query(Artifact, project_counts.c.project_count) + .join(project_counts, Artifact.id == project_counts.c.artifact_id) + .filter(project_counts.c.project_count > 1) + .order_by(project_counts.c.project_count.desc(), Artifact.size.desc()) + .limit(limit) + ) + + shared_artifacts = [] + total_savings = 0 + + for artifact, project_count in shared_artifacts_query: + # Calculate savings: (project_count - 1) * size + savings = (project_count - 1) * artifact.size + total_savings += savings + + # Get project names + project_names = ( + db.query(func.distinct(Project.name)) + .join(Package, Package.project_id == Project.id) + .join(Upload, Upload.package_id == Package.id) + .filter(Upload.artifact_id == artifact.id) + .all() + ) + + shared_artifacts.append( + { + "artifact_id": artifact.id, + "size": artifact.size, + "project_count": project_count, + "projects": [p[0] for p in project_names], + "storage_savings": savings, + } + ) + + # Total count of shared artifacts + shared_count = ( + db.query(func.count()) + .select_from(project_counts) + .filter(project_counts.c.project_count > 1) + .scalar() + or 0 + ) + + return CrossProjectDeduplicationResponse( + shared_artifacts_count=shared_count, + total_cross_project_savings=total_savings, + shared_artifacts=shared_artifacts, + ) + + +# ============================================================================= +# Time-Based Statistics Endpoint +# ============================================================================= + + +@router.get("/api/v1/stats/timeline", response_model=TimeBasedStatsResponse) +def get_time_based_stats( + period: str = Query(default="daily", regex="^(daily|weekly|monthly)$"), + from_date: Optional[datetime] = Query(default=None), + to_date: Optional[datetime] = Query(default=None), + db: Session = Depends(get_db), +): + """Get deduplication statistics over time.""" + from datetime import timedelta + + # Default date range: last 30 days + if to_date is None: + to_date = datetime.utcnow() + if from_date is None: + from_date = to_date - timedelta(days=30) + + # Determine date truncation based on period + if period == "daily": + date_trunc = func.date_trunc("day", Upload.uploaded_at) + elif period == "weekly": + date_trunc = func.date_trunc("week", Upload.uploaded_at) + else: # monthly + date_trunc = func.date_trunc("month", Upload.uploaded_at) + + # Query uploads grouped by period + stats = ( + db.query( + date_trunc.label("period_start"), + func.count(Upload.id).label("total_uploads"), + func.count(func.distinct(Upload.artifact_id)).label("unique_artifacts"), + func.count(Upload.id) + .filter(Upload.deduplicated == True) + .label("duplicated"), + func.coalesce( + func.sum(Artifact.size).filter(Upload.deduplicated == True), 0 + 
).label("bytes_saved"), + ) + .join(Artifact, Upload.artifact_id == Artifact.id) + .filter(Upload.uploaded_at >= from_date, Upload.uploaded_at <= to_date) + .group_by(date_trunc) + .order_by(date_trunc) + .all() + ) + + data_points = [ + { + "date": row.period_start.isoformat() if row.period_start else None, + "total_uploads": row.total_uploads, + "unique_artifacts": row.unique_artifacts, + "duplicated_uploads": row.duplicated, + "bytes_saved": row.bytes_saved, + } + for row in stats + ] + + return TimeBasedStatsResponse( + period=period, + start_date=from_date, + end_date=to_date, + data_points=data_points, + ) + + +# ============================================================================= +# CSV Export Endpoint +# ============================================================================= + + +@router.get("/api/v1/stats/export") +def export_stats( + format: str = Query(default="json", regex="^(json|csv)$"), + db: Session = Depends(get_db), +): + """Export global statistics in JSON or CSV format.""" + from fastapi.responses import Response + + # Gather all stats + total_artifacts = db.query(func.count(Artifact.id)).scalar() or 0 + total_size = db.query(func.coalesce(func.sum(Artifact.size), 0)).scalar() or 0 + total_uploads = db.query(func.count(Upload.id)).scalar() or 0 + deduplicated_uploads = ( + db.query(func.count(Upload.id)).filter(Upload.deduplicated == True).scalar() + or 0 + ) + unique_artifacts = ( + db.query(func.count(Artifact.id)).filter(Artifact.ref_count > 0).scalar() or 0 + ) + + storage_saved = ( + db.query(func.coalesce(func.sum(Artifact.size), 0)) + .join(Upload, Upload.artifact_id == Artifact.id) + .filter(Upload.deduplicated == True) + .scalar() + or 0 + ) + + stats = { + "generated_at": datetime.utcnow().isoformat(), + "total_artifacts": total_artifacts, + "total_size_bytes": total_size, + "total_uploads": total_uploads, + "unique_artifacts": unique_artifacts, + "deduplicated_uploads": deduplicated_uploads, + "storage_saved_bytes": storage_saved, + "deduplication_ratio": total_uploads / unique_artifacts + if unique_artifacts > 0 + else 1.0, + } + + if format == "csv": + import csv + import io + + output = io.StringIO() + writer = csv.writer(output) + writer.writerow(["Metric", "Value"]) + for key, value in stats.items(): + writer.writerow([key, value]) + + return Response( + content=output.getvalue(), + media_type="text/csv", + headers={"Content-Disposition": "attachment; filename=orchard_stats.csv"}, + ) + + return stats + + +# ============================================================================= +# Summary Report Endpoint +# ============================================================================= + + +@router.get("/api/v1/stats/report", response_model=StatsReportResponse) +def generate_stats_report( + format: str = Query(default="markdown", regex="^(markdown|json)$"), + db: Session = Depends(get_db), +): + """Generate a summary report of storage and deduplication statistics.""" + # Gather stats + total_artifacts = db.query(func.count(Artifact.id)).scalar() or 0 + total_size = db.query(func.coalesce(func.sum(Artifact.size), 0)).scalar() or 0 + total_uploads = db.query(func.count(Upload.id)).scalar() or 0 + deduplicated_uploads = ( + db.query(func.count(Upload.id)).filter(Upload.deduplicated == True).scalar() + or 0 + ) + unique_artifacts = ( + db.query(func.count(Artifact.id)).filter(Artifact.ref_count > 0).scalar() or 0 + ) + orphaned_artifacts = ( + db.query(func.count(Artifact.id)).filter(Artifact.ref_count == 0).scalar() or 0 + ) + + 
storage_saved = ( + db.query(func.coalesce(func.sum(Artifact.size), 0)) + .join(Upload, Upload.artifact_id == Artifact.id) + .filter(Upload.deduplicated == True) + .scalar() + or 0 + ) + + project_count = db.query(func.count(Project.id)).scalar() or 0 + package_count = db.query(func.count(Package.id)).scalar() or 0 + + # Top 5 most referenced artifacts + top_artifacts = ( + db.query(Artifact) + .filter(Artifact.ref_count > 1) + .order_by(Artifact.ref_count.desc()) + .limit(5) + .all() + ) + + def format_bytes(b): + for unit in ["B", "KB", "MB", "GB", "TB"]: + if b < 1024: + return f"{b:.2f} {unit}" + b /= 1024 + return f"{b:.2f} PB" + + generated_at = datetime.utcnow() + + if format == "markdown": + report = f"""# Orchard Storage Report + +Generated: {generated_at.strftime("%Y-%m-%d %H:%M:%S UTC")} + +## Overview + +| Metric | Value | +|--------|-------| +| Projects | {project_count} | +| Packages | {package_count} | +| Total Artifacts | {total_artifacts} | +| Unique Artifacts | {unique_artifacts} | +| Orphaned Artifacts | {orphaned_artifacts} | + +## Storage + +| Metric | Value | +|--------|-------| +| Total Storage Used | {format_bytes(total_size)} | +| Storage Saved | {format_bytes(storage_saved)} | +| Savings Percentage | {(storage_saved / (total_size + storage_saved) * 100) if (total_size + storage_saved) > 0 else 0:.1f}% | + +## Uploads + +| Metric | Value | +|--------|-------| +| Total Uploads | {total_uploads} | +| Deduplicated Uploads | {deduplicated_uploads} | +| Deduplication Ratio | {total_uploads / unique_artifacts if unique_artifacts > 0 else 1:.2f}x | + +## Top Referenced Artifacts + +| Artifact ID | Size | References | Savings | +|-------------|------|------------|---------| +""" + for art in top_artifacts: + savings = (art.ref_count - 1) * art.size + report += f"| `{art.id[:12]}...` | {format_bytes(art.size)} | {art.ref_count} | {format_bytes(savings)} |\n" + + return StatsReportResponse( + format="markdown", + generated_at=generated_at, + content=report, + ) + + # JSON format + return StatsReportResponse( + format="json", + generated_at=generated_at, + content=json.dumps( + { + "overview": { + "projects": project_count, + "packages": package_count, + "total_artifacts": total_artifacts, + "unique_artifacts": unique_artifacts, + "orphaned_artifacts": orphaned_artifacts, + }, + "storage": { + "total_bytes": total_size, + "saved_bytes": storage_saved, + "savings_percentage": ( + storage_saved / (total_size + storage_saved) * 100 + ) + if (total_size + storage_saved) > 0 + else 0, + }, + "uploads": { + "total": total_uploads, + "deduplicated": deduplicated_uploads, + "ratio": total_uploads / unique_artifacts + if unique_artifacts > 0 + else 1, + }, + "top_artifacts": [ + { + "id": art.id, + "size": art.size, + "ref_count": art.ref_count, + "savings": (art.ref_count - 1) * art.size, + } + for art in top_artifacts + ], + }, + indent=2, + ), ) diff --git a/backend/app/schemas.py b/backend/app/schemas.py index f2ac50a..203c842 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -456,3 +456,62 @@ class ProjectStatsResponse(BaseModel): total_size_bytes: int upload_count: int deduplicated_uploads: int + storage_saved_bytes: int = 0 # Bytes saved through deduplication + deduplication_ratio: float = 1.0 # upload_count / artifact_count + + +class PackageStatsResponse(BaseModel): + """Per-package statistics""" + + package_id: str + package_name: str + project_name: str + tag_count: int + artifact_count: int + total_size_bytes: int + upload_count: int + 
deduplicated_uploads: int
+    storage_saved_bytes: int = 0
+    deduplication_ratio: float = 1.0
+
+
+class ArtifactStatsResponse(BaseModel):
+    """Per-artifact reference statistics"""
+
+    artifact_id: str
+    sha256: str
+    size: int
+    ref_count: int
+    storage_savings: int  # (ref_count - 1) * size
+    tags: List[Dict[str, Any]]  # Tags referencing this artifact
+    projects: List[str]  # Projects using this artifact
+    packages: List[str]  # Packages using this artifact
+    first_uploaded: Optional[datetime] = None
+    last_referenced: Optional[datetime] = None
+
+
+class CrossProjectDeduplicationResponse(BaseModel):
+    """Cross-project deduplication statistics"""
+
+    shared_artifacts_count: int  # Artifacts used in multiple projects
+    total_cross_project_savings: int  # Bytes saved by cross-project sharing
+    shared_artifacts: List[Dict[str, Any]]  # Details of shared artifacts
+
+
+class TimeBasedStatsResponse(BaseModel):
+    """Time-based deduplication statistics"""
+
+    period: str  # "daily", "weekly", "monthly"
+    start_date: datetime
+    end_date: datetime
+    data_points: List[
+        Dict[str, Any]
+    ]  # {date, total_uploads, unique_artifacts, duplicated_uploads, bytes_saved}
+
+
+class StatsReportResponse(BaseModel):
+    """Summary report (markdown or JSON)"""
+
+    format: str  # "markdown" or "json" (CSV is served by /api/v1/stats/export)
+    generated_at: datetime
+    content: str  # The report content
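
Reviewer notes (trailing text after the diff, not part of the patch):

A minimal usage sketch for the new endpoints, assuming a locally running
instance. The base URL, project/package names, and output path below are
hypothetical; only the routes, query parameters, and response fields come
from this patch.

    import httpx

    BASE = "http://localhost:8000"  # hypothetical local deployment

    with httpx.Client(base_url=BASE) as client:
        # Per-package stats (project/package names are placeholders)
        r = client.get("/api/v1/project/myproj/packages/mypkg/stats")
        r.raise_for_status()
        print(r.json()["deduplication_ratio"])

        # Weekly timeline over an explicit date range
        r = client.get(
            "/api/v1/stats/timeline",
            params={
                "period": "weekly",
                "from_date": "2025-12-01T00:00:00",
                "to_date": "2026-01-01T00:00:00",
            },
        )
        for point in r.json()["data_points"]:
            print(point["date"], point["bytes_saved"])

        # CSV export, saved to disk
        r = client.get("/api/v1/stats/export", params={"format": "csv"})
        with open("orchard_stats.csv", "wb") as f:
            f.write(r.content)

        # Markdown report printed to stdout
        r = client.get("/api/v1/stats/report", params={"format": "markdown"})
        print(r.json()["content"])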
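
And a quick sanity check of the report math. The three formulas mirror the
code in this patch (deduplication_ratio, savings percentage, cross-project
savings); every number below is made up for illustration.

    # deduplication_ratio = upload_count / artifact_count (1.0 means no dedup)
    upload_count = 150      # hypothetical: upload rows in a project
    artifact_count = 100    # hypothetical: distinct artifacts stored
    ratio = upload_count / artifact_count if artifact_count > 0 else 1.0
    assert ratio == 1.5     # 150 uploads landed on 100 unique blobs

    # Savings percentage compares saved bytes against the logical total:
    # bytes on disk plus bytes never written thanks to deduplication.
    total_size = 160 * 2**30     # hypothetical: 160 GiB on disk
    storage_saved = 40 * 2**30   # hypothetical: 40 GiB of deduplicated uploads
    logical_total = total_size + storage_saved
    pct = storage_saved / logical_total * 100 if logical_total > 0 else 0
    assert pct == 20.0           # 40 GiB saved out of a logical 200 GiB

    # Cross-project savings for one artifact shared by N projects:
    # the blob is stored once instead of N times.
    size, project_count = 512 * 2**20, 3  # hypothetical: 512 MiB blob, 3 projects
    assert (project_count - 1) * size == 1024 * 2**20  # 1 GiB saved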