From be2d8d70cba0e6588e43d2febb092c37d53cbd0e Mon Sep 17 00:00:00 2001
From: Jose Henrique <jose.henrique.ivan@gmail.com>
Date: Tue, 31 Mar 2026 16:42:16 -0300
Subject: [PATCH] Run retrieval, ranking, and curator stages in parallel buckets

---
 packages/backend/src/agents/ranking.ts        | 18 ++--
 .../backend/src/pipelines/recommendation.ts   | 86 ++++++++++++++++---
 2 files changed, 84 insertions(+), 20 deletions(-)
diff --git a/packages/backend/src/agents/ranking.ts b/packages/backend/src/agents/ranking.ts
index b5b0683..dc89858 100644
--- a/packages/backend/src/agents/ranking.ts
+++ b/packages/backend/src/agents/ranking.ts
@@ -31,7 +31,7 @@ export async function runRanking(
     chunks.push(filtered.slice(i, i + CHUNK_SIZE));
   }
 
-  const allBuckets: RankingOutput = {
+  const allTags: RankingOutput = {
     definitely_like: [],
     might_like: [],
     questionable: [],
@@ -46,15 +46,15 @@ export async function runRanking(
       temperature: 0.2,
       ...serviceOptions,
       text: { format: zodTextFormat(RankingSchema, "ranking") },
-      instructions: `You are a ${mediaLabel} ranking critic. Assign each ${mediaLabel} to exactly one of four confidence buckets based on how well it matches the user's preferences.
+      instructions: `You are a ${mediaLabel} ranking critic. Assign each ${mediaLabel} to exactly one of four confidence tags based on how well it matches the user's preferences.
 
-Buckets:
+Tags:
 - "definitely_like": Near-perfect match to all preferences
 - "might_like": Strong match to most preferences
 - "questionable": Partial alignment, some aspects don't match
 - "will_not_like": Likely mismatch, conflicts with preferences or avoidance criteria
 
-Every ${mediaLabel} in the input must appear in exactly one bucket. Use the title exactly as given.`,
+Every ${mediaLabel} in the input must appear in exactly one tag. Use the title exactly as given.`,
       input: `User preferences:
 Liked ${mediaLabel}s: ${JSON.stringify(interpreter.liked)}
 Themes: ${JSON.stringify(interpreter.themes)}
@@ -68,11 +68,11 @@ ${chunkTitles}`,
 
     const chunkResult = (response.output_parsed as Partial<RankingOutput>) ?? {};
 
-    allBuckets.definitely_like.push(...(chunkResult.definitely_like ?? []));
-    allBuckets.might_like.push(...(chunkResult.might_like ?? []));
-    allBuckets.questionable.push(...(chunkResult.questionable ?? []));
-    allBuckets.will_not_like.push(...(chunkResult.will_not_like ?? []));
+    allTags.definitely_like.push(...(chunkResult.definitely_like ?? []));
+    allTags.might_like.push(...(chunkResult.might_like ?? []));
+    allTags.questionable.push(...(chunkResult.questionable ?? []));
+    allTags.will_not_like.push(...(chunkResult.will_not_like ?? []));
   }
 
-  return allBuckets;
+  return allTags;
 }
diff --git a/packages/backend/src/pipelines/recommendation.ts b/packages/backend/src/pipelines/recommendation.ts
index de0bee9..9840170 100644
--- a/packages/backend/src/pipelines/recommendation.ts
+++ b/packages/backend/src/pipelines/recommendation.ts
@@ -5,7 +5,7 @@ import { runInterpreter } from '../agents/interpreter.js';
 import { runRetrieval } from '../agents/retrieval.js';
 import { runRanking } from '../agents/ranking.js';
 import { runCurator } from '../agents/curator.js';
-import type { CuratorOutput, MediaType, SSEEvent } from '../types/agents.js';
+import type { CuratorOutput, MediaType, RankingOutput, RetrievalCandidate, SSEEvent } from '../types/agents.js';
 import { generateTitle } from '../agents/titleGenerator.js';
 
 /* -- Agent pipeline --
@@ -17,6 +17,29 @@ import { generateTitle } from '../agents/titleGenerator.js';
 
 type RecommendationRecord = typeof recommendations.$inferSelect;
 
+/** Maps an item count to a bucket count: up to 50 gives 1, up to 100 gives 2, up to 150 gives 3, otherwise 4. */
+function getBucketCount(count: number): number {
+  const upperBounds = [50, 100, 150];
+  const tier = upperBounds.findIndex((limit) => count <= limit);
+  return tier === -1 ? upperBounds.length + 1 : tier + 1;
+}
+
+/** Keeps the first candidate per title; titles are compared ignoring case, Unicode form, and whitespace differences. */
+function deduplicateCandidates(candidates: RetrievalCandidate[]): RetrievalCandidate[] {
+  const seen = new Set<string>();
+  return candidates.filter((c) => {
+    const key = c.title.normalize('NFKC').trim().replace(/\s+/g, ' ').toLowerCase();
+    const sizeBefore = seen.size;
+    return seen.add(key).size > sizeBefore; // the set only grows when the key is new
+  });
+}
+
+/** Splits items into at most n contiguous chunks; n is floored and clamped to at least 1 so no item is ever dropped. */
+function splitIntoBuckets<T>(items: T[], n: number): T[][] {
+  const size = Math.max(1, Math.ceil(items.length / Math.max(1, Math.floor(n) || 1))); // chunk length, never 0 or NaN
+  return Array.from({ length: Math.ceil(items.length / size) }, (_, i) => items.slice(i * size, (i + 1) * size));
+}
+
 function log(recId: string, msg: string, data?: unknown) {
   const ts = new Date().toISOString();
   if (data !== undefined) {
@@ -68,24 +91,45 @@ export async function runPipeline(
     });
     sseWrite({ stage: 'interpreter', status: 'done', data: interpreterOutput });
 
-    // --- Retrieval ---
+    // --- Retrieval (bucketed) ---
     currentStage = 'retrieval';
     log(rec.id, 'Retrieval: start');
     sseWrite({ stage: 'retrieval', status: 'start' });
     const t1 = Date.now();
-    const retrievalOutput = await runRetrieval(interpreterOutput, rec.brainstorm_count, mediaType, useWebSearch);
-    log(rec.id, `Retrieval: done (${Date.now() - t1}ms) — ${retrievalOutput.candidates.length} candidates`, {
-      titles: retrievalOutput.candidates.map((c) => c.title),
+    const retrievalBucketCount = getBucketCount(rec.brainstorm_count);
+    const perBucketCount = Math.ceil(rec.brainstorm_count / retrievalBucketCount);
+    const retrievalBuckets = await Promise.all(
+      Array.from({ length: retrievalBucketCount }, () =>
+        runRetrieval(interpreterOutput, perBucketCount, mediaType, useWebSearch)
+      )
+    );
+    const allCandidates = retrievalBuckets.flatMap((r) => r.candidates);
+    const dedupedCandidates = deduplicateCandidates(allCandidates);
+    const retrievalOutput = { candidates: dedupedCandidates };
+    log(rec.id, `Retrieval: done (${Date.now() - t1}ms) — ${dedupedCandidates.length} candidates (${retrievalBucketCount} buckets, ${allCandidates.length} before dedup)`, {
+      titles: dedupedCandidates.map((c) => c.title),
     });
     sseWrite({ stage: 'retrieval', status: 'done', data: retrievalOutput });
 
-    // --- Ranking ---
+    // --- Ranking (bucketed) ---
     currentStage = 'ranking';
     log(rec.id, 'Ranking: start');
     sseWrite({ stage: 'ranking', status: 'start' });
     const t2 = Date.now();
-    const rankingOutput = await runRanking(interpreterOutput, retrievalOutput, mediaType);
-    log(rec.id, `Ranking: done (${Date.now() - t2}ms)`, {
+    const rankBucketCount = getBucketCount(dedupedCandidates.length);
+    const candidateBuckets = splitIntoBuckets(dedupedCandidates, rankBucketCount);
+    const rankingBuckets = await Promise.all(
+      candidateBuckets.map((bucket) =>
+        runRanking(interpreterOutput, { candidates: bucket }, mediaType)
+      )
+    );
+    const rankingOutput: RankingOutput = {
+      definitely_like: rankingBuckets.flatMap((r) => r.definitely_like),
+      might_like: rankingBuckets.flatMap((r) => r.might_like),
+      questionable: rankingBuckets.flatMap((r) => r.questionable),
+      will_not_like: rankingBuckets.flatMap((r) => r.will_not_like),
+    };
+    log(rec.id, `Ranking: done (${Date.now() - t2}ms) — ${rankBucketCount} buckets`, {
       definitely_like: rankingOutput.definitely_like.length,
       might_like: rankingOutput.might_like.length,
       questionable: rankingOutput.questionable.length,
@@ -93,13 +137,33 @@ export async function runPipeline(
     });
     sseWrite({ stage: 'ranking', status: 'done', data: rankingOutput });
 
-    // --- Curator ---
+    // --- Curator (bucketed) ---
     currentStage = 'curator';
     log(rec.id, 'Curator: start');
     sseWrite({ stage: 'curator', status: 'start' });
     const t3 = Date.now();
-    const curatorOutput = await runCurator(rankingOutput, interpreterOutput, mediaType, useWebSearch);
-    log(rec.id, `Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated`);
+    type CategorizedItem = { title: string; category: keyof RankingOutput };
+    const categorizedItems: CategorizedItem[] = [
+      ...rankingOutput.definitely_like.map((t) => ({ title: t, category: 'definitely_like' as const })),
+      ...rankingOutput.might_like.map((t) => ({ title: t, category: 'might_like' as const })),
+      ...rankingOutput.questionable.map((t) => ({ title: t, category: 'questionable' as const })),
+      ...rankingOutput.will_not_like.map((t) => ({ title: t, category: 'will_not_like' as const })),
+    ];
+    const curatorBucketCount = getBucketCount(categorizedItems.length);
+    const curatorItemBuckets = splitIntoBuckets(categorizedItems, curatorBucketCount);
+    const curatorBucketRankings: RankingOutput[] = curatorItemBuckets.map((bucket) => ({
+      definitely_like: bucket.filter((i) => i.category === 'definitely_like').map((i) => i.title),
+      might_like: bucket.filter((i) => i.category === 'might_like').map((i) => i.title),
+      questionable: bucket.filter((i) => i.category === 'questionable').map((i) => i.title),
+      will_not_like: bucket.filter((i) => i.category === 'will_not_like').map((i) => i.title),
+    }));
+    const curatorBucketOutputs = await Promise.all(
+      curatorBucketRankings.map((ranking) =>
+        runCurator(ranking, interpreterOutput, mediaType, useWebSearch)
+      )
+    );
+    const curatorOutput = curatorBucketOutputs.flat();
+    log(rec.id, `Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated (${curatorBucketCount} buckets)`);
     sseWrite({ stage: 'curator', status: 'done', data: curatorOutput });
 
     // Generate AI title