new things
All checks were successful
Recommender Build and Deploy (internal) / Build Recommender Image (push) Successful in 4m4s
Recommender Build and Deploy (internal) / Deploy Recommender (internal) (push) Successful in 11s

This commit is contained in:
2026-04-02 19:24:58 -03:00
parent 91870f4046
commit ba38092784
19 changed files with 695 additions and 130 deletions

View File

@@ -3,14 +3,16 @@ import { db } from '../db.js';
import { recommendations } from '../db/schema.js';
import { runInterpreter } from '../agents/interpreter.js';
import { runRetrieval } from '../agents/retrieval.js';
import { runValidator } from '../agents/validator.js';
import { runRanking } from '../agents/ranking.js';
import { runCurator } from '../agents/curator.js';
import type { CuratorOutput, MediaType, RankingOutput, RetrievalCandidate, SSEEvent } from '../types/agents.js';
import type { CuratorOutput, InterpreterOutput, MediaType, RankingOutput, RetrievalCandidate, SSEEvent } from '../types/agents.js';
import { generateTitle } from '../agents/titleGenerator.js';
/* -- Agent pipeline --
[1] Interpreter -> gets user input, transforms into structured data
[2] Retrieval -> gets candidates from OpenAI (high temperature)
[2.5] Validator (optional) -> verifies candidates exist, removes trash
[3] Ranking -> ranks candidates based on user input
[4] Curator -> curates candidates based on user input
*/
@@ -24,8 +26,8 @@ function getBucketCount(count: number): number {
return 4;
}
function deduplicateCandidates(candidates: RetrievalCandidate[]): RetrievalCandidate[] {
const seen = new Set<string>();
function deduplicateCandidates(candidates: RetrievalCandidate[], seenTitles?: Set<string>): RetrievalCandidate[] {
const seen = seenTitles ?? new Set<string>();
return candidates.filter((c) => {
const key = c.title.toLowerCase();
if (seen.has(key)) return false;
@@ -40,6 +42,11 @@ function splitIntoBuckets<T>(items: T[], n: number): T[][] {
.filter((b) => b.length > 0);
}
/**
 * Merges a follow-up pass's curator results (`b`) into the accumulated
 * results (`a`): order of `a` is preserved, and an entry from `b` is
 * appended only when its title (case-insensitive) is not already in `a`.
 * NOTE(review): titles duplicated *within* `b` are not collapsed here —
 * presumably upstream dedup (allSeenTitles) prevents that case; confirm.
 */
function mergeCuratorOutputs(a: CuratorOutput[], b: CuratorOutput[]): CuratorOutput[] {
  const titlesInA = new Set<string>();
  for (const entry of a) {
    titlesInA.add(entry.title.toLowerCase());
  }
  const additions = b.filter((entry) => !titlesInA.has(entry.title.toLowerCase()));
  return a.concat(additions);
}
function log(recId: string, msg: string, data?: unknown) {
const ts = new Date().toISOString();
if (data !== undefined) {
@@ -49,6 +56,123 @@ function log(recId: string, msg: string, data?: unknown) {
}
}
/**
 * Inputs for one runSubPipeline pass
 * (Retrieval → [Validator] → Ranking → Curator).
 */
interface SubPipelineCtx {
  // Recommendation record id, used only for log correlation.
  recId: string;
  // Structured interpretation of the user's request; fed to every agent call.
  interpreterOutput: InterpreterOutput;
  mediaType: MediaType;
  // Feature flags forwarded verbatim to the agent calls.
  useWebSearch: boolean;
  useValidator: boolean;
  useHardRequirements: boolean;
  // Total number of candidates to brainstorm across all retrieval buckets.
  brainstormCount: number;
  // Titles rated "Full Match" in earlier passes, handed to retrieval
  // (presumably as context to avoid repeats / seed expansion — confirm in runRetrieval).
  previousFullMatches: string[];
  // Shared *mutable* dedup set of lowercased titles seen across all passes;
  // deduplicateCandidates adds to it, so later passes skip earlier titles.
  allSeenTitles: Set<string>;
  // Prepended to SSE stage names, e.g. 'pass2:' → 'pass2:retrieval'; '' for pass 1.
  stagePrefix: string;
  // Sink for server-sent progress events.
  sseWrite: (event: SSEEvent) => void;
}
/**
 * Runs one complete agent pass: bucketed Retrieval, optional Validator,
 * bucketed Ranking, then bucketed Curator. Emits start/done SSE events for
 * each stage (prefixed with ctx.stagePrefix) and returns the curated items.
 * Side effects: mutates ctx.allSeenTitles (via deduplicateCandidates) and
 * writes progress to the log and to ctx.sseWrite.
 */
async function runSubPipeline(ctx: SubPipelineCtx): Promise<CuratorOutput[]> {
  const {
    recId, interpreterOutput, mediaType, useWebSearch, useValidator,
    useHardRequirements, brainstormCount, previousFullMatches,
    allSeenTitles, stagePrefix, sseWrite,
  } = ctx;
  // Builds the prefixed stage name ('' prefix on pass 1, 'passN:' after).
  const p = (stage: string) => (stagePrefix + stage) as SSEEvent['stage'];
  // --- Retrieval (bucketed) ---
  // Fan out into parallel buckets, each asked for a share of brainstormCount.
  log(recId, `${stagePrefix}Retrieval: start`);
  sseWrite({ stage: p('retrieval'), status: 'start' });
  const t1 = Date.now();
  const retrievalBucketCount = getBucketCount(brainstormCount);
  // Ceil so the buckets together cover at least brainstormCount candidates.
  const perBucketCount = Math.ceil(brainstormCount / retrievalBucketCount);
  const retrievalBuckets = await Promise.all(
    Array.from({ length: retrievalBucketCount }, () =>
      runRetrieval(interpreterOutput, perBucketCount, mediaType, useWebSearch, useHardRequirements, previousFullMatches)
    )
  );
  const allCandidates = retrievalBuckets.flatMap((r) => r.candidates);
  // Dedup against the cross-pass shared set; this also records new titles in it.
  const dedupedCandidates = deduplicateCandidates(allCandidates, allSeenTitles);
  log(recId, `${stagePrefix}Retrieval: done (${Date.now() - t1}ms) — ${dedupedCandidates.length} candidates (${retrievalBucketCount} buckets, ${allCandidates.length} before dedup)`, {
    titles: dedupedCandidates.map((c) => c.title),
  });
  sseWrite({ stage: p('retrieval'), status: 'done', data: { candidates: dedupedCandidates } });
  // --- Validator (optional) ---
  // When enabled, drops candidates the validator marks isTrash; a 'done'
  // event is emitted either way so clients always see the stage complete.
  let candidatesForRanking = dedupedCandidates;
  if (useValidator) {
    log(recId, `${stagePrefix}Validator: start`);
    sseWrite({ stage: p('validator'), status: 'start' });
    const tV = Date.now();
    const validatorOutput = await runValidator(dedupedCandidates, mediaType);
    const verified = validatorOutput.candidates.filter((c) => !c.isTrash);
    const trashCount = validatorOutput.candidates.length - verified.length;
    // Re-project to the { title, reason } candidate shape expected by ranking.
    candidatesForRanking = verified.map(({ title, reason }) => ({ title, reason }));
    log(recId, `${stagePrefix}Validator: done (${Date.now() - tV}ms) — removed ${trashCount} trash entries`);
    sseWrite({ stage: p('validator'), status: 'done', data: { removed: trashCount } });
  } else {
    sseWrite({ stage: p('validator'), status: 'done', data: { skipped: true } });
  }
  // --- Ranking (bucketed) ---
  // Split candidates into buckets, rank each in parallel, then merge the
  // per-category lists back into one RankingOutput.
  log(recId, `${stagePrefix}Ranking: start`);
  sseWrite({ stage: p('ranking'), status: 'start' });
  const t2 = Date.now();
  const rankBucketCount = getBucketCount(candidatesForRanking.length);
  const candidateBuckets = splitIntoBuckets(candidatesForRanking, rankBucketCount);
  const rankingBuckets = await Promise.all(
    candidateBuckets.map((bucket) =>
      runRanking(interpreterOutput, { candidates: bucket }, mediaType, useHardRequirements)
    )
  );
  const rankingOutput: RankingOutput = {
    full_match: rankingBuckets.flatMap((r) => r.full_match),
    definitely_like: rankingBuckets.flatMap((r) => r.definitely_like),
    might_like: rankingBuckets.flatMap((r) => r.might_like),
    questionable: rankingBuckets.flatMap((r) => r.questionable),
    will_not_like: rankingBuckets.flatMap((r) => r.will_not_like),
  };
  log(recId, `${stagePrefix}Ranking: done (${Date.now() - t2}ms) — ${rankBucketCount} buckets`, {
    full_match: rankingOutput.full_match.length,
    definitely_like: rankingOutput.definitely_like.length,
    might_like: rankingOutput.might_like.length,
    questionable: rankingOutput.questionable.length,
    will_not_like: rankingOutput.will_not_like.length,
  });
  sseWrite({ stage: p('ranking'), status: 'done', data: rankingOutput });
  // --- Curator (bucketed) ---
  // Flatten the ranking into (title, category) pairs so items can be split
  // into buckets while remembering their category, then rebuild a
  // per-bucket RankingOutput for each parallel curator call.
  log(recId, `${stagePrefix}Curator: start`);
  sseWrite({ stage: p('curator'), status: 'start' });
  const t3 = Date.now();
  type CategorizedItem = { title: string; category: keyof RankingOutput };
  const categorizedItems: CategorizedItem[] = [
    ...rankingOutput.full_match.map((t) => ({ title: t, category: 'full_match' as const })),
    ...rankingOutput.definitely_like.map((t) => ({ title: t, category: 'definitely_like' as const })),
    ...rankingOutput.might_like.map((t) => ({ title: t, category: 'might_like' as const })),
    ...rankingOutput.questionable.map((t) => ({ title: t, category: 'questionable' as const })),
    ...rankingOutput.will_not_like.map((t) => ({ title: t, category: 'will_not_like' as const })),
  ];
  const curatorBucketCount = getBucketCount(categorizedItems.length);
  const curatorItemBuckets = splitIntoBuckets(categorizedItems, curatorBucketCount);
  const curatorBucketRankings: RankingOutput[] = curatorItemBuckets.map((bucket) => ({
    full_match: bucket.filter((i) => i.category === 'full_match').map((i) => i.title),
    definitely_like: bucket.filter((i) => i.category === 'definitely_like').map((i) => i.title),
    might_like: bucket.filter((i) => i.category === 'might_like').map((i) => i.title),
    questionable: bucket.filter((i) => i.category === 'questionable').map((i) => i.title),
    will_not_like: bucket.filter((i) => i.category === 'will_not_like').map((i) => i.title),
  }));
  const curatorBucketOutputs = await Promise.all(
    curatorBucketRankings.map((ranking) =>
      runCurator(ranking, interpreterOutput, mediaType, useWebSearch)
    )
  );
  const curatorOutput = curatorBucketOutputs.flat();
  log(recId, `${stagePrefix}Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated (${curatorBucketCount} buckets)`);
  sseWrite({ stage: p('curator'), status: 'done', data: curatorOutput });
  return curatorOutput;
}
export async function runPipeline(
rec: RecommendationRecord,
sseWrite: (event: SSEEvent) => void,
@@ -58,8 +182,11 @@ export async function runPipeline(
const startTime = Date.now();
const mediaType = (rec.media_type ?? 'tv_show') as MediaType;
const useWebSearch = rec.use_web_search ?? false;
const useValidator = rec.use_validator ?? false;
const useHardRequirements = rec.hard_requirements ?? false;
const selfExpansive = rec.self_expansive ?? false;
log(rec.id, `Starting pipeline for "${rec.title}" [${mediaType}${useWebSearch ? ', web_search' : ''}]${feedbackContext ? ' (with feedback context)' : ''}`);
log(rec.id, `Starting pipeline for "${rec.title}" [${mediaType}${useWebSearch ? ', web_search' : ''}${useValidator ? ', validator' : ''}${useHardRequirements ? ', hard_req' : ''}${selfExpansive ? `, expansive×${rec.expansive_passes}(${rec.expansive_mode})` : ''}]${feedbackContext ? ' (with feedback context)' : ''}`);
try {
// Set status to running
@@ -91,84 +218,69 @@ export async function runPipeline(
});
sseWrite({ stage: 'interpreter', status: 'done', data: interpreterOutput });
// --- Retrieval (bucketed) ---
// --- Pass 1: Retrieval → [Validator?] → Ranking → Curator ---
currentStage = 'retrieval';
log(rec.id, 'Retrieval: start');
sseWrite({ stage: 'retrieval', status: 'start' });
const t1 = Date.now();
const retrievalBucketCount = getBucketCount(rec.brainstorm_count);
const perBucketCount = Math.ceil(rec.brainstorm_count / retrievalBucketCount);
const retrievalBuckets = await Promise.all(
Array.from({ length: retrievalBucketCount }, () =>
runRetrieval(interpreterOutput, perBucketCount, mediaType, useWebSearch)
)
);
const allCandidates = retrievalBuckets.flatMap((r) => r.candidates);
const dedupedCandidates = deduplicateCandidates(allCandidates);
const retrievalOutput = { candidates: dedupedCandidates };
log(rec.id, `Retrieval: done (${Date.now() - t1}ms) — ${dedupedCandidates.length} candidates (${retrievalBucketCount} buckets, ${allCandidates.length} before dedup)`, {
titles: dedupedCandidates.map((c) => c.title),
const allSeenTitles = new Set<string>();
const pass1Output = await runSubPipeline({
recId: rec.id,
interpreterOutput,
mediaType,
useWebSearch,
useValidator,
useHardRequirements,
brainstormCount: rec.brainstorm_count,
previousFullMatches: [],
allSeenTitles,
stagePrefix: '',
sseWrite: (event) => {
currentStage = event.stage;
sseWrite(event);
},
});
sseWrite({ stage: 'retrieval', status: 'done', data: retrievalOutput });
// --- Ranking (bucketed) ---
currentStage = 'ranking';
log(rec.id, 'Ranking: start');
sseWrite({ stage: 'ranking', status: 'start' });
const t2 = Date.now();
const rankBucketCount = getBucketCount(dedupedCandidates.length);
const candidateBuckets = splitIntoBuckets(dedupedCandidates, rankBucketCount);
const rankingBuckets = await Promise.all(
candidateBuckets.map((bucket) =>
runRanking(interpreterOutput, { candidates: bucket }, mediaType)
)
);
const rankingOutput: RankingOutput = {
full_match: rankingBuckets.flatMap((r) => r.full_match),
definitely_like: rankingBuckets.flatMap((r) => r.definitely_like),
might_like: rankingBuckets.flatMap((r) => r.might_like),
questionable: rankingBuckets.flatMap((r) => r.questionable),
will_not_like: rankingBuckets.flatMap((r) => r.will_not_like),
};
log(rec.id, `Ranking: done (${Date.now() - t2}ms) — ${rankBucketCount} buckets`, {
full_match: rankingOutput.full_match.length,
definitely_like: rankingOutput.definitely_like.length,
might_like: rankingOutput.might_like.length,
questionable: rankingOutput.questionable.length,
will_not_like: rankingOutput.will_not_like.length,
});
sseWrite({ stage: 'ranking', status: 'done', data: rankingOutput });
let mergedOutput = pass1Output;
// --- Curator (bucketed) ---
currentStage = 'curator';
log(rec.id, 'Curator: start');
sseWrite({ stage: 'curator', status: 'start' });
const t3 = Date.now();
type CategorizedItem = { title: string; category: keyof RankingOutput };
const categorizedItems: CategorizedItem[] = [
...rankingOutput.full_match.map((t) => ({ title: t, category: 'full_match' as const })),
...rankingOutput.definitely_like.map((t) => ({ title: t, category: 'definitely_like' as const })),
...rankingOutput.might_like.map((t) => ({ title: t, category: 'might_like' as const })),
...rankingOutput.questionable.map((t) => ({ title: t, category: 'questionable' as const })),
...rankingOutput.will_not_like.map((t) => ({ title: t, category: 'will_not_like' as const })),
];
const curatorBucketCount = getBucketCount(categorizedItems.length);
const curatorItemBuckets = splitIntoBuckets(categorizedItems, curatorBucketCount);
const curatorBucketRankings: RankingOutput[] = curatorItemBuckets.map((bucket) => ({
full_match: bucket.filter((i) => i.category === 'full_match').map((i) => i.title),
definitely_like: bucket.filter((i) => i.category === 'definitely_like').map((i) => i.title),
might_like: bucket.filter((i) => i.category === 'might_like').map((i) => i.title),
questionable: bucket.filter((i) => i.category === 'questionable').map((i) => i.title),
will_not_like: bucket.filter((i) => i.category === 'will_not_like').map((i) => i.title),
}));
const curatorBucketOutputs = await Promise.all(
curatorBucketRankings.map((ranking) =>
runCurator(ranking, interpreterOutput, mediaType, useWebSearch)
)
);
const curatorOutput = curatorBucketOutputs.flat();
log(rec.id, `Curator: done (${Date.now() - t3}ms) — ${curatorOutput.length} items curated (${curatorBucketCount} buckets)`);
sseWrite({ stage: 'curator', status: 'done', data: curatorOutput });
// --- Self Expansive: extra passes ---
if (selfExpansive && rec.expansive_passes > 0) {
const allFullMatches = pass1Output
.filter((c) => c.category === 'Full Match')
.map((c) => c.title);
for (let i = 0; i < rec.expansive_passes; i++) {
const passNum = i + 2;
const passCount = rec.expansive_mode === 'extreme' ? rec.brainstorm_count : 60;
const passPrefix = `pass${passNum}:` as const;
log(rec.id, `Self Expansive Pass ${passNum}: start (${passCount} candidates, ${allFullMatches.length} full matches as context)`);
currentStage = `${passPrefix}retrieval` as SSEEvent['stage'];
const passOutput = await runSubPipeline({
recId: rec.id,
interpreterOutput,
mediaType,
useWebSearch,
useValidator,
useHardRequirements,
brainstormCount: passCount,
previousFullMatches: [...allFullMatches],
allSeenTitles,
stagePrefix: passPrefix,
sseWrite: (event) => {
currentStage = event.stage;
sseWrite(event);
},
});
mergedOutput = mergeCuratorOutputs(mergedOutput, passOutput);
const newFullMatches = passOutput
.filter((c) => c.category === 'Full Match')
.map((c) => c.title);
allFullMatches.push(...newFullMatches);
log(rec.id, `Self Expansive Pass ${passNum}: done — ${passOutput.length} new items, ${mergedOutput.length} total`);
}
}
// Generate AI title
let aiTitle: string = rec.title;
@@ -180,17 +292,27 @@ export async function runPipeline(
log(rec.id, `Title generation failed, keeping initial title: ${String(err)}`);
}
// Sort by category order before saving
const CATEGORY_ORDER: Record<string, number> = {
'Full Match': 0,
'Definitely Like': 1,
'Might Like': 2,
'Questionable': 3,
'Will Not Like': 4,
};
mergedOutput.sort((a, b) => (CATEGORY_ORDER[a.category] ?? 99) - (CATEGORY_ORDER[b.category] ?? 99));
// Save results to DB
log(rec.id, 'Saving results to DB');
await db
.update(recommendations)
.set({ recommendations: curatorOutput, status: 'done', title: aiTitle })
.set({ recommendations: mergedOutput, status: 'done', title: aiTitle })
.where(eq(recommendations.id, rec.id));
sseWrite({ stage: 'complete', status: 'done', data: { title: aiTitle } });
log(rec.id, `Pipeline complete (total: ${Date.now() - startTime}ms)`);
return curatorOutput;
return mergedOutput;
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
log(rec.id, `Pipeline error at stage "${currentStage}": ${message}`);