Add comprehensive stats endpoints and reporting features

Backend stats endpoints:
- GET /api/v1/project/:project/packages/:package/stats - per-package stats
- GET /api/v1/artifact/:id/stats - artifact reference statistics
- GET /api/v1/stats/cross-project - cross-project deduplication detection
- GET /api/v1/stats/timeline - time-based metrics (daily/weekly/monthly)
- GET /api/v1/stats/export - CSV/JSON export
- GET /api/v1/stats/report - markdown/JSON summary report generation
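
A quick smoke test against the new endpoints might look like this (a sketch; the host, project, and package names are hypothetical):

import requests

BASE = "http://localhost:8000"  # assumed dev server

# Per-package stats
r = requests.get(f"{BASE}/api/v1/project/myproject/packages/mypackage/stats")
print(r.json()["deduplication_ratio"])

# Top 10 artifacts shared across projects
r = requests.get(f"{BASE}/api/v1/stats/cross-project", params={"limit": 10})
print(r.json()["total_cross_project_savings"])

# Markdown summary report
r = requests.get(f"{BASE}/api/v1/stats/report", params={"format": "markdown"})
print(r.json()["content"])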

Enhanced existing endpoints:
- Added storage_saved_bytes and deduplication_ratio to project stats
- Added date range filtering via from_date/to_date params
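
Date-windowed queries take ISO 8601 datetimes; for example, against the timeline endpoint (a sketch with hypothetical dates, assumed local server):

import requests

r = requests.get(
    "http://localhost:8000/api/v1/stats/timeline",
    params={
        "period": "weekly",
        "from_date": "2025-12-01T00:00:00",
        "to_date": "2026-01-01T00:00:00",
    },
)
for point in r.json()["data_points"]:
    print(point["date"], point["bytes_saved"])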

New schemas:
- PackageStatsResponse
- ArtifactStatsResponse
- CrossProjectDeduplicationResponse
- TimeBasedStatsResponse
- StatsReportResponse
Mondo Diaz
2026-01-05 14:57:47 -06:00
parent e215ecabcd
commit c79b10cbc5
2 changed files with 572 additions and 1 deletion


@@ -1,3 +1,4 @@
import json
from datetime import datetime, timedelta, timezone
from fastapi import (
APIRouter,
@@ -80,6 +81,11 @@ from .schemas import (
StorageStatsResponse,
DeduplicationStatsResponse,
ProjectStatsResponse,
PackageStatsResponse,
ArtifactStatsResponse,
CrossProjectDeduplicationResponse,
TimeBasedStatsResponse,
StatsReportResponse,
)
from .metadata import extract_metadata
from .config import get_settings
@@ -2499,17 +2505,25 @@ def get_project_stats(
artifact_count = artifact_stats[0] if artifact_stats else 0
total_size_bytes = artifact_stats[1] if artifact_stats else 0
# Upload counts and storage saved
upload_stats = (
db.query(
func.count(Upload.id),
func.count(Upload.id).filter(Upload.deduplicated == True),
func.coalesce(
func.sum(Artifact.size).filter(Upload.deduplicated == True), 0
),
)
.join(Artifact, Upload.artifact_id == Artifact.id)
.filter(Upload.package_id.in_(package_ids))
.first()
)
upload_count = upload_stats[0] if upload_stats else 0
deduplicated_uploads = upload_stats[1] if upload_stats else 0
storage_saved_bytes = upload_stats[2] if upload_stats else 0
# Calculate deduplication ratio
deduplication_ratio = upload_count / artifact_count if artifact_count > 0 else 1.0
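# e.g. 150 uploads resolving to 100 unique artifacts gives a ratio of 1.5x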
return ProjectStatsResponse(
project_id=str(project.id),
@@ -2520,4 +2534,502 @@ def get_project_stats(
total_size_bytes=total_size_bytes,
upload_count=upload_count,
deduplicated_uploads=deduplicated_uploads,
storage_saved_bytes=storage_saved_bytes,
deduplication_ratio=deduplication_ratio,
)
# =============================================================================
# Package Statistics Endpoint
# =============================================================================
@router.get(
"/api/v1/project/{project_name}/packages/{package_name}/stats",
response_model=PackageStatsResponse,
)
def get_package_stats(
project_name: str,
package_name: str,
db: Session = Depends(get_db),
):
"""Get statistics for a specific package."""
project = db.query(Project).filter(Project.name == project_name).first()
if not project:
raise HTTPException(status_code=404, detail="Project not found")
package = (
db.query(Package)
.filter(Package.project_id == project.id, Package.name == package_name)
.first()
)
if not package:
raise HTTPException(status_code=404, detail="Package not found")
# Tag count
tag_count = (
db.query(func.count(Tag.id)).filter(Tag.package_id == package.id).scalar() or 0
)
# Artifact stats via uploads
artifact_stats = (
db.query(
func.count(func.distinct(Upload.artifact_id)),
func.coalesce(func.sum(Artifact.size), 0),
)
.join(Artifact, Upload.artifact_id == Artifact.id)
.filter(Upload.package_id == package.id)
.first()
)
artifact_count = artifact_stats[0] if artifact_stats else 0
total_size_bytes = artifact_stats[1] if artifact_stats else 0
# Upload stats
upload_stats = (
db.query(
func.count(Upload.id),
func.count(Upload.id).filter(Upload.deduplicated == True),
func.coalesce(
func.sum(Artifact.size).filter(Upload.deduplicated == True), 0
),
)
.join(Artifact, Upload.artifact_id == Artifact.id)
.filter(Upload.package_id == package.id)
.first()
)
upload_count = upload_stats[0] if upload_stats else 0
deduplicated_uploads = upload_stats[1] if upload_stats else 0
storage_saved_bytes = upload_stats[2] if upload_stats else 0
deduplication_ratio = upload_count / artifact_count if artifact_count > 0 else 1.0
return PackageStatsResponse(
package_id=str(package.id),
package_name=package.name,
project_name=project.name,
tag_count=tag_count,
artifact_count=artifact_count,
total_size_bytes=total_size_bytes,
upload_count=upload_count,
deduplicated_uploads=deduplicated_uploads,
storage_saved_bytes=storage_saved_bytes,
deduplication_ratio=deduplication_ratio,
)
# =============================================================================
# Artifact Statistics Endpoint
# =============================================================================
@router.get(
"/api/v1/artifact/{artifact_id}/stats", response_model=ArtifactStatsResponse
)
def get_artifact_stats(
artifact_id: str,
db: Session = Depends(get_db),
):
"""Get detailed statistics for a specific artifact."""
artifact = db.query(Artifact).filter(Artifact.id == artifact_id).first()
if not artifact:
raise HTTPException(status_code=404, detail="Artifact not found")
# Get all tags referencing this artifact
tags = (
db.query(Tag, Package, Project)
.join(Package, Tag.package_id == Package.id)
.join(Project, Package.project_id == Project.id)
.filter(Tag.artifact_id == artifact_id)
.all()
)
tag_list = [
{
"tag_name": tag.name,
"package_name": pkg.name,
"project_name": proj.name,
"created_at": tag.created_at.isoformat() if tag.created_at else None,
}
for tag, pkg, proj in tags
]
# Get unique projects and packages
projects = list(set(proj.name for _, _, proj in tags))
packages = list(set(f"{proj.name}/{pkg.name}" for _, pkg, proj in tags))
# Get first and last upload times
upload_times = (
db.query(func.min(Upload.uploaded_at), func.max(Upload.uploaded_at))
.filter(Upload.artifact_id == artifact_id)
.first()
)
return ArtifactStatsResponse(
artifact_id=artifact.id,
sha256=artifact.id,
size=artifact.size,
ref_count=artifact.ref_count,
storage_savings=(artifact.ref_count - 1) * artifact.size
if artifact.ref_count > 1
else 0,
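# e.g. ref_count=3 at size=1048576 (1 MiB): one stored copy serves three references, saving 2 MiB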
tags=tag_list,
projects=projects,
packages=packages,
first_uploaded=upload_times[0] if upload_times else None,
last_referenced=upload_times[1] if upload_times else None,
)
# =============================================================================
# Cross-Project Deduplication Endpoint
# =============================================================================
@router.get(
"/api/v1/stats/cross-project", response_model=CrossProjectDeduplicationResponse
)
def get_cross_project_deduplication(
limit: int = Query(default=20, ge=1, le=100),
db: Session = Depends(get_db),
):
"""Get statistics about artifacts shared across multiple projects."""
# Find artifacts that appear in multiple projects
# Subquery to count distinct projects per artifact
project_counts = (
db.query(
Upload.artifact_id,
func.count(func.distinct(Package.project_id)).label("project_count"),
)
.join(Package, Upload.package_id == Package.id)
.group_by(Upload.artifact_id)
.subquery()
)
# Get artifacts with more than one project
shared_artifacts_query = (
db.query(Artifact, project_counts.c.project_count)
.join(project_counts, Artifact.id == project_counts.c.artifact_id)
.filter(project_counts.c.project_count > 1)
.order_by(project_counts.c.project_count.desc(), Artifact.size.desc())
.limit(limit)
)
shared_artifacts = []
total_savings = 0
for artifact, project_count in shared_artifacts_query:
# Calculate savings: (project_count - 1) * size
savings = (project_count - 1) * artifact.size
total_savings += savings
# Get project names
project_names = (
db.query(func.distinct(Project.name))
.join(Package, Package.project_id == Project.id)
.join(Upload, Upload.package_id == Package.id)
.filter(Upload.artifact_id == artifact.id)
.all()
)
shared_artifacts.append(
{
"artifact_id": artifact.id,
"size": artifact.size,
"project_count": project_count,
"projects": [p[0] for p in project_names],
"storage_savings": savings,
}
)
# Total count of shared artifacts
shared_count = (
db.query(func.count())
.select_from(project_counts)
.filter(project_counts.c.project_count > 1)
.scalar()
or 0
)
return CrossProjectDeduplicationResponse(
shared_artifacts_count=shared_count,
total_cross_project_savings=total_savings,
shared_artifacts=shared_artifacts,
)
# =============================================================================
# Time-Based Statistics Endpoint
# =============================================================================
@router.get("/api/v1/stats/timeline", response_model=TimeBasedStatsResponse)
def get_time_based_stats(
period: str = Query(default="daily", regex="^(daily|weekly|monthly)$"),
from_date: Optional[datetime] = Query(default=None),
to_date: Optional[datetime] = Query(default=None),
db: Session = Depends(get_db),
):
"""Get deduplication statistics over time."""
from datetime import timedelta
# Default date range: last 30 days
if to_date is None:
to_date = datetime.utcnow()
if from_date is None:
from_date = to_date - timedelta(days=30)
# Determine date truncation based on period
if period == "daily":
date_trunc = func.date_trunc("day", Upload.uploaded_at)
elif period == "weekly":
date_trunc = func.date_trunc("week", Upload.uploaded_at)
else: # monthly
date_trunc = func.date_trunc("month", Upload.uploaded_at)
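# note: date_trunc requires PostgreSQL; other backends would need an equivalent bucketing expression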
# Query uploads grouped by period
stats = (
db.query(
date_trunc.label("period_start"),
func.count(Upload.id).label("total_uploads"),
func.count(func.distinct(Upload.artifact_id)).label("unique_artifacts"),
func.count(Upload.id)
.filter(Upload.deduplicated == True)
.label("duplicated"),
func.coalesce(
func.sum(Artifact.size).filter(Upload.deduplicated == True), 0
).label("bytes_saved"),
)
.join(Artifact, Upload.artifact_id == Artifact.id)
.filter(Upload.uploaded_at >= from_date, Upload.uploaded_at <= to_date)
.group_by(date_trunc)
.order_by(date_trunc)
.all()
)
data_points = [
{
"date": row.period_start.isoformat() if row.period_start else None,
"total_uploads": row.total_uploads,
"unique_artifacts": row.unique_artifacts,
"duplicated_uploads": row.duplicated,
"bytes_saved": row.bytes_saved,
}
for row in stats
]
return TimeBasedStatsResponse(
period=period,
start_date=from_date,
end_date=to_date,
data_points=data_points,
)
# =============================================================================
# CSV Export Endpoint
# =============================================================================
@router.get("/api/v1/stats/export")
def export_stats(
format: str = Query(default="json", regex="^(json|csv)$"),
db: Session = Depends(get_db),
):
"""Export global statistics in JSON or CSV format."""
from fastapi.responses import Response
# Gather all stats
total_artifacts = db.query(func.count(Artifact.id)).scalar() or 0
total_size = db.query(func.coalesce(func.sum(Artifact.size), 0)).scalar() or 0
total_uploads = db.query(func.count(Upload.id)).scalar() or 0
deduplicated_uploads = (
db.query(func.count(Upload.id)).filter(Upload.deduplicated == True).scalar()
or 0
)
unique_artifacts = (
db.query(func.count(Artifact.id)).filter(Artifact.ref_count > 0).scalar() or 0
)
storage_saved = (
db.query(func.coalesce(func.sum(Artifact.size), 0))
.join(Upload, Upload.artifact_id == Artifact.id)
.filter(Upload.deduplicated == True)
.scalar()
or 0
)
stats = {
"generated_at": datetime.utcnow().isoformat(),
"total_artifacts": total_artifacts,
"total_size_bytes": total_size,
"total_uploads": total_uploads,
"unique_artifacts": unique_artifacts,
"deduplicated_uploads": deduplicated_uploads,
"storage_saved_bytes": storage_saved,
"deduplication_ratio": total_uploads / unique_artifacts
if unique_artifacts > 0
else 1.0,
}
if format == "csv":
import csv
import io
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(["Metric", "Value"])
for key, value in stats.items():
writer.writerow([key, value])
return Response(
content=output.getvalue(),
media_type="text/csv",
headers={"Content-Disposition": "attachment; filename=orchard_stats.csv"},
)
return stats
# =============================================================================
# Summary Report Endpoint
# =============================================================================
@router.get("/api/v1/stats/report", response_model=StatsReportResponse)
def generate_stats_report(
format: str = Query(default="markdown", regex="^(markdown|json)$"),
db: Session = Depends(get_db),
):
"""Generate a summary report of storage and deduplication statistics."""
# Gather stats
total_artifacts = db.query(func.count(Artifact.id)).scalar() or 0
total_size = db.query(func.coalesce(func.sum(Artifact.size), 0)).scalar() or 0
total_uploads = db.query(func.count(Upload.id)).scalar() or 0
deduplicated_uploads = (
db.query(func.count(Upload.id)).filter(Upload.deduplicated == True).scalar()
or 0
)
unique_artifacts = (
db.query(func.count(Artifact.id)).filter(Artifact.ref_count > 0).scalar() or 0
)
orphaned_artifacts = (
db.query(func.count(Artifact.id)).filter(Artifact.ref_count == 0).scalar() or 0
)
storage_saved = (
db.query(func.coalesce(func.sum(Artifact.size), 0))
.join(Upload, Upload.artifact_id == Artifact.id)
.filter(Upload.deduplicated == True)
.scalar()
or 0
)
project_count = db.query(func.count(Project.id)).scalar() or 0
package_count = db.query(func.count(Package.id)).scalar() or 0
# Top 5 most referenced artifacts
top_artifacts = (
db.query(Artifact)
.filter(Artifact.ref_count > 1)
.order_by(Artifact.ref_count.desc())
.limit(5)
.all()
)
def format_bytes(b):
for unit in ["B", "KB", "MB", "GB", "TB"]:
if b < 1024:
return f"{b:.2f} {unit}"
b /= 1024
return f"{b:.2f} PB"
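# e.g. format_bytes(1536) -> "1.50 KB"; format_bytes(5 * 1024**3) -> "5.00 GB"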
generated_at = datetime.utcnow()
if format == "markdown":
report = f"""# Orchard Storage Report
Generated: {generated_at.strftime("%Y-%m-%d %H:%M:%S UTC")}
## Overview
| Metric | Value |
|--------|-------|
| Projects | {project_count} |
| Packages | {package_count} |
| Total Artifacts | {total_artifacts} |
| Unique Artifacts | {unique_artifacts} |
| Orphaned Artifacts | {orphaned_artifacts} |
## Storage
| Metric | Value |
|--------|-------|
| Total Storage Used | {format_bytes(total_size)} |
| Storage Saved | {format_bytes(storage_saved)} |
| Savings Percentage | {(storage_saved / (total_size + storage_saved) * 100) if (total_size + storage_saved) > 0 else 0:.1f}% |
## Uploads
| Metric | Value |
|--------|-------|
| Total Uploads | {total_uploads} |
| Deduplicated Uploads | {deduplicated_uploads} |
| Deduplication Ratio | {total_uploads / unique_artifacts if unique_artifacts > 0 else 1:.2f}x |
## Top Referenced Artifacts
| Artifact ID | Size | References | Savings |
|-------------|------|------------|---------|
"""
for art in top_artifacts:
savings = (art.ref_count - 1) * art.size
report += f"| `{art.id[:12]}...` | {format_bytes(art.size)} | {art.ref_count} | {format_bytes(savings)} |\n"
return StatsReportResponse(
format="markdown",
generated_at=generated_at,
content=report,
)
# JSON format
return StatsReportResponse(
format="json",
generated_at=generated_at,
content=json.dumps(
{
"overview": {
"projects": project_count,
"packages": package_count,
"total_artifacts": total_artifacts,
"unique_artifacts": unique_artifacts,
"orphaned_artifacts": orphaned_artifacts,
},
"storage": {
"total_bytes": total_size,
"saved_bytes": storage_saved,
"savings_percentage": (
storage_saved / (total_size + storage_saved) * 100
)
if (total_size + storage_saved) > 0
else 0,
},
"uploads": {
"total": total_uploads,
"deduplicated": deduplicated_uploads,
"ratio": total_uploads / unique_artifacts
if unique_artifacts > 0
else 1,
},
"top_artifacts": [
{
"id": art.id,
"size": art.size,
"ref_count": art.ref_count,
"savings": (art.ref_count - 1) * art.size,
}
for art in top_artifacts
],
},
indent=2,
),
)
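
The CSV export can be written straight to disk; a minimal sketch (assumed local server):

import requests

r = requests.get(
    "http://localhost:8000/api/v1/stats/export",
    params={"format": "csv"},
)
with open("orchard_stats.csv", "wb") as f:
    f.write(r.content)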


@@ -456,3 +456,62 @@ class ProjectStatsResponse(BaseModel):
total_size_bytes: int
upload_count: int
deduplicated_uploads: int
storage_saved_bytes: int = 0 # Bytes saved through deduplication
deduplication_ratio: float = 1.0 # upload_count / artifact_count
class PackageStatsResponse(BaseModel):
"""Per-package statistics"""
package_id: str
package_name: str
project_name: str
tag_count: int
artifact_count: int
total_size_bytes: int
upload_count: int
deduplicated_uploads: int
storage_saved_bytes: int = 0
deduplication_ratio: float = 1.0
class ArtifactStatsResponse(BaseModel):
"""Per-artifact reference statistics"""
artifact_id: str
sha256: str
size: int
ref_count: int
storage_savings: int # (ref_count - 1) * size
tags: List[Dict[str, Any]] # Tags referencing this artifact
projects: List[str] # Projects using this artifact
packages: List[str] # Packages using this artifact
first_uploaded: Optional[datetime] = None
last_referenced: Optional[datetime] = None
class CrossProjectDeduplicationResponse(BaseModel):
"""Cross-project deduplication statistics"""
shared_artifacts_count: int # Artifacts used in multiple projects
total_cross_project_savings: int # Bytes saved by cross-project sharing
shared_artifacts: List[Dict[str, Any]] # Details of shared artifacts
class TimeBasedStatsResponse(BaseModel):
"""Time-based deduplication statistics"""
period: str # "daily", "weekly", "monthly"
start_date: datetime
end_date: datetime
data_points: List[
Dict[str, Any]
] # List of {date, uploads, unique, duplicated, bytes_saved}
class StatsReportResponse(BaseModel):
"""Summary report in various formats"""
format: str # "json", "csv", "markdown"
generated_at: datetime
content: str # The report content
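
As a sanity check, the new schemas round-trip like any other pydantic model; a sketch with made-up values (ids and names are hypothetical):

stats = PackageStatsResponse(
    package_id="pkg-123",
    package_name="mypackage",
    project_name="myproject",
    tag_count=4,
    artifact_count=10,
    total_size_bytes=1048576,
    upload_count=15,
    deduplicated_uploads=5,
    storage_saved_bytes=524288,
    deduplication_ratio=1.5,  # upload_count / artifact_count
)
print(stats.json(indent=2))  # .model_dump_json(indent=2) on pydantic v2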