diff --git a/utils/seed_data.py b/utils/seed_data.py index 0bd7eea..d653a2b 100755 --- a/utils/seed_data.py +++ b/utils/seed_data.py @@ -202,8 +202,26 @@ async def generate_seed_data(num_artifacts: int = 50) -> List[int]: print(f"Deployment mode: {settings.deployment_mode}") print(f"Storage backend: {settings.storage_backend}") - # Generate some SIM source IDs that will be reused (simulating multiple artifacts per source) - sim_sources = [f"sim_run_{uuid.uuid4().hex[:8]}" for _ in range(max(num_artifacts // 3, 1))] + # Generate SIM source IDs - each source will have 2-4 artifacts + num_sim_sources = max(num_artifacts // 3, 1) + sim_sources = [f"sim_run_{uuid.uuid4().hex[:8]}" for _ in range(num_sim_sources)] + + # Pre-assign artifacts to SIM sources to ensure grouping + sim_source_assignments = [] + for sim_source in sim_sources: + # Each SIM source gets 2-4 artifacts + num_artifacts_for_source = random.randint(2, 4) + sim_source_assignments.extend([sim_source] * num_artifacts_for_source) + + # Pad remaining artifacts with None (ungrouped) or random sources + while len(sim_source_assignments) < num_artifacts: + if random.random() < 0.3: # 30% ungrouped + sim_source_assignments.append(None) + else: + sim_source_assignments.append(random.choice(sim_sources)) + + # Shuffle to randomize order + random.shuffle(sim_source_assignments) for i in range(num_artifacts): # Randomly choose file type @@ -229,8 +247,8 @@ async def generate_seed_data(num_artifacts: int = 50) -> List[int]: # Upload to storage storage_path = await upload_artifact_to_storage(content, filename) - # Randomly assign a SIM source ID (70% chance of having one, enabling grouping) - sim_source_id = random.choice(sim_sources) if random.random() < 0.7 else None + # Get pre-assigned SIM source ID for this artifact + sim_source_id = sim_source_assignments[i] # Generate metadata artifact_data = create_artifact_data(i, sim_source_id)