From be2d8d70cba0e6588e43d2febb092c37d53cbd0e Mon Sep 17 00:00:00 2001 From: Jose Henrique Date: Tue, 31 Mar 2026 16:42:16 -0300 Subject: [PATCH] adding buckets! --- packages/backend/src/agents/ranking.ts | 18 ++-- .../backend/src/pipelines/recommendation.ts | 86 ++++++++++++++++--- 2 files changed, 84 insertions(+), 20 deletions(-) diff --git a/packages/backend/src/agents/ranking.ts b/packages/backend/src/agents/ranking.ts index b5b0683..dc89858 100644 --- a/packages/backend/src/agents/ranking.ts +++ b/packages/backend/src/agents/ranking.ts @@ -31,7 +31,7 @@ export async function runRanking( chunks.push(filtered.slice(i, i + CHUNK_SIZE)); } - const allBuckets: RankingOutput = { + const allTags: RankingOutput = { definitely_like: [], might_like: [], questionable: [], @@ -46,15 +46,15 @@ export async function runRanking( temperature: 0.2, ...serviceOptions, text: { format: zodTextFormat(RankingSchema, "ranking") }, - instructions: `You are a ${mediaLabel} ranking critic. Assign each ${mediaLabel} to exactly one of four confidence buckets based on how well it matches the user's preferences. + instructions: `You are a ${mediaLabel} ranking critic. Assign each ${mediaLabel} to exactly one of four confidence tags based on how well it matches the user's preferences. -Buckets: +Tags: - "definitely_like": Near-perfect match to all preferences - "might_like": Strong match to most preferences - "questionable": Partial alignment, some aspects don't match - "will_not_like": Likely mismatch, conflicts with preferences or avoidance criteria -Every ${mediaLabel} in the input must appear in exactly one bucket. Use the title exactly as given.`, +Every ${mediaLabel} in the input must appear in exactly one tag. Use the title exactly as given.`, input: `User preferences: Liked ${mediaLabel}s: ${JSON.stringify(interpreter.liked)} Themes: ${JSON.stringify(interpreter.themes)} @@ -68,11 +68,11 @@ ${chunkTitles}`, const chunkResult = (response.output_parsed as Partial<RankingOutput>) ?? 
{}; - allBuckets.definitely_like.push(...(chunkResult.definitely_like ?? [])); - allBuckets.might_like.push(...(chunkResult.might_like ?? [])); - allBuckets.questionable.push(...(chunkResult.questionable ?? [])); - allBuckets.will_not_like.push(...(chunkResult.will_not_like ?? [])); + allTags.definitely_like.push(...(chunkResult.definitely_like ?? [])); + allTags.might_like.push(...(chunkResult.might_like ?? [])); + allTags.questionable.push(...(chunkResult.questionable ?? [])); + allTags.will_not_like.push(...(chunkResult.will_not_like ?? [])); } - return allBuckets; + return allTags; } diff --git a/packages/backend/src/pipelines/recommendation.ts b/packages/backend/src/pipelines/recommendation.ts index de0bee9..9840170 100644 --- a/packages/backend/src/pipelines/recommendation.ts +++ b/packages/backend/src/pipelines/recommendation.ts @@ -5,7 +5,7 @@ import { runInterpreter } from '../agents/interpreter.js'; import { runRetrieval } from '../agents/retrieval.js'; import { runRanking } from '../agents/ranking.js'; import { runCurator } from '../agents/curator.js'; -import type { CuratorOutput, MediaType, SSEEvent } from '../types/agents.js'; +import type { CuratorOutput, MediaType, RankingOutput, RetrievalCandidate, SSEEvent } from '../types/agents.js'; import { generateTitle } from '../agents/titleGenerator.js'; /* -- Agent pipeline -- @@ -17,6 +17,29 @@ import { generateTitle } from '../agents/titleGenerator.js'; type RecommendationRecord = typeof recommendations.$inferSelect; +function getBucketCount(count: number): number { + if (count <= 50) return 1; + if (count <= 100) return 2; + if (count <= 150) return 3; + return 4; +} + +function deduplicateCandidates(candidates: RetrievalCandidate[]): RetrievalCandidate[] { + const seen = new Set(); + return candidates.filter((c) => { + const key = c.title.toLowerCase(); + if (seen.has(key)) return false; + seen.add(key); + return true; + }); +} + +function splitIntoBuckets<T>(items: T[], n: number): T[][] { + const size = 
Math.ceil(items.length / n); + return Array.from({ length: n }, (_, i) => items.slice(i * size, (i + 1) * size)) + .filter((b) => b.length > 0); +} + function log(recId: string, msg: string, data?: unknown) { const ts = new Date().toISOString(); if (data !== undefined) { @@ -68,24 +91,45 @@ export async function runPipeline( }); sseWrite({ stage: 'interpreter', status: 'done', data: interpreterOutput }); - // --- Retrieval --- + // --- Retrieval (bucketed) --- currentStage = 'retrieval'; log(rec.id, 'Retrieval: start'); sseWrite({ stage: 'retrieval', status: 'start' }); const t1 = Date.now(); - const retrievalOutput = await runRetrieval(interpreterOutput, rec.brainstorm_count, mediaType, useWebSearch); - log(rec.id, `Retrieval: done (${Date.now() - t1}ms) — ${retrievalOutput.candidates.length} candidates`, { - titles: retrievalOutput.candidates.map((c) => c.title), + const retrievalBucketCount = getBucketCount(rec.brainstorm_count); + const perBucketCount = Math.ceil(rec.brainstorm_count / retrievalBucketCount); + const retrievalBuckets = await Promise.all( + Array.from({ length: retrievalBucketCount }, () => + runRetrieval(interpreterOutput, perBucketCount, mediaType, useWebSearch) + ) + ); + const allCandidates = retrievalBuckets.flatMap((r) => r.candidates); + const dedupedCandidates = deduplicateCandidates(allCandidates); + const retrievalOutput = { candidates: dedupedCandidates }; + log(rec.id, `Retrieval: done (${Date.now() - t1}ms) — ${dedupedCandidates.length} candidates (${retrievalBucketCount} buckets, ${allCandidates.length} before dedup)`, { + titles: dedupedCandidates.map((c) => c.title), }); sseWrite({ stage: 'retrieval', status: 'done', data: retrievalOutput }); - // --- Ranking --- + // --- Ranking (bucketed) --- currentStage = 'ranking'; log(rec.id, 'Ranking: start'); sseWrite({ stage: 'ranking', status: 'start' }); const t2 = Date.now(); - const rankingOutput = await runRanking(interpreterOutput, retrievalOutput, mediaType); - log(rec.id, 
`Ranking: done (${Date.now() - t2}ms)`, { + const rankBucketCount = getBucketCount(dedupedCandidates.length); + const candidateBuckets = splitIntoBuckets(dedupedCandidates, rankBucketCount); + const rankingBuckets = await Promise.all( + candidateBuckets.map((bucket) => + runRanking(interpreterOutput, { candidates: bucket }, mediaType) + ) + ); + const rankingOutput: RankingOutput = { + definitely_like: rankingBuckets.flatMap((r) => r.definitely_like), + might_like: rankingBuckets.flatMap((r) => r.might_like), + questionable: rankingBuckets.flatMap((r) => r.questionable), + will_not_like: rankingBuckets.flatMap((r) => r.will_not_like), + }; + log(rec.id, `Ranking: done (${Date.now() - t2}ms) — ${rankBucketCount} buckets`, { definitely_like: rankingOutput.definitely_like.length, might_like: rankingOutput.might_like.length, questionable: rankingOutput.questionable.length, @@ -93,13 +137,33 @@ export async function runPipeline( }); sseWrite({ stage: 'ranking', status: 'done', data: rankingOutput }); - // --- Curator --- + // --- Curator (bucketed) --- currentStage = 'curator'; log(rec.id, 'Curator: start'); sseWrite({ stage: 'curator', status: 'start' }); const t3 = Date.now(); - const curatorOutput = await runCurator(rankingOutput, interpreterOutput, mediaType, useWebSearch); - log(rec.id, `Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated`); + type CategorizedItem = { title: string; category: keyof RankingOutput }; + const categorizedItems: CategorizedItem[] = [ + ...rankingOutput.definitely_like.map((t) => ({ title: t, category: 'definitely_like' as const })), + ...rankingOutput.might_like.map((t) => ({ title: t, category: 'might_like' as const })), + ...rankingOutput.questionable.map((t) => ({ title: t, category: 'questionable' as const })), + ...rankingOutput.will_not_like.map((t) => ({ title: t, category: 'will_not_like' as const })), + ]; + const curatorBucketCount = getBucketCount(categorizedItems.length); + const curatorItemBuckets = 
splitIntoBuckets(categorizedItems, curatorBucketCount); + const curatorBucketRankings: RankingOutput[] = curatorItemBuckets.map((bucket) => ({ + definitely_like: bucket.filter((i) => i.category === 'definitely_like').map((i) => i.title), + might_like: bucket.filter((i) => i.category === 'might_like').map((i) => i.title), + questionable: bucket.filter((i) => i.category === 'questionable').map((i) => i.title), + will_not_like: bucket.filter((i) => i.category === 'will_not_like').map((i) => i.title), + })); + const curatorBucketOutputs = await Promise.all( + curatorBucketRankings.map((ranking) => + runCurator(ranking, interpreterOutput, mediaType, useWebSearch) + ) + ); + const curatorOutput = curatorBucketOutputs.flat(); + log(rec.id, `Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated (${curatorBucketCount} buckets)`); sseWrite({ stage: 'curator', status: 'done', data: curatorOutput }); // Generate AI title