prmbr-image-mksaas/src/app/api/analyze-content/route.ts

461 lines
14 KiB
TypeScript

import {
ErrorSeverity,
ErrorType,
WebContentAnalyzerError,
classifyError,
logError,
withRetry,
} from '@/ai/text/utils/error-handling';
import {
type AnalysisResults,
type AnalyzeContentResponse,
analyzeContentRequestSchema,
validateUrl,
} from '@/ai/text/utils/web-content-analyzer';
import {
validateFirecrawlConfig,
webContentAnalyzerConfig,
} from '@/ai/text/utils/web-content-analyzer-config';
import { createDeepSeek } from '@ai-sdk/deepseek';
import { createGoogleGenerativeAI } from '@ai-sdk/google';
import { createOpenAI } from '@ai-sdk/openai';
import FirecrawlApp from '@mendable/firecrawl-js';
import { createOpenRouter } from '@openrouter/ai-sdk-provider';
import { generateObject } from 'ai';
import { type NextRequest, NextResponse } from 'next/server';
import { z } from 'zod';
// Constants from configuration
const TIMEOUT_MILLIS = webContentAnalyzerConfig.timeoutMillis;
const MAX_CONTENT_LENGTH = webContentAnalyzerConfig.maxContentLength;
// Initialize Firecrawl client
const getFirecrawlClient = () => {
if (!validateFirecrawlConfig()) {
throw new Error('Firecrawl API key is not configured');
}
return new FirecrawlApp({
apiKey: webContentAnalyzerConfig.firecrawl.apiKey,
});
};
// AI analysis schema for structured output
const analysisSchema = z.object({
title: z.string().describe('Main title or product name from the webpage'),
description: z.string().describe('Brief description in 1-2 sentences'),
introduction: z
.string()
.describe('Detailed introduction paragraph about the content'),
features: z.array(z.string()).describe('List of key features or highlights'),
pricing: z
.string()
.describe('Pricing information or "Not specified" if unavailable'),
useCases: z.array(z.string()).describe('List of use cases or applications'),
});
// Timeout wrapper
const withTimeout = <T>(
promise: Promise<T>,
timeoutMillis: number
): Promise<T> => {
return Promise.race([
promise,
new Promise<T>((_, reject) =>
setTimeout(() => reject(new Error('Request timed out')), timeoutMillis)
),
]);
};
// Enhanced content truncation with intelligent boundary detection
const truncateContent = (content: string, maxLength: number): string => {
if (content.length <= maxLength) {
return content;
}
const { contentTruncation } = webContentAnalyzerConfig;
const preferredLength = Math.floor(
maxLength * contentTruncation.preferredTruncationPoint
);
// If content is shorter than minimum threshold, use simple truncation
if (content.length < contentTruncation.minContentLength) {
return content.substring(0, maxLength) + '...';
}
// Try to find the best truncation point
const truncated = content.substring(0, preferredLength);
// First, try to truncate at sentence boundaries
const sentences = content.split(/[.!?]+/);
if (sentences.length > 1) {
let sentenceLength = 0;
let sentenceCount = 0;
for (const sentence of sentences) {
const nextLength = sentenceLength + sentence.length + 1; // +1 for punctuation
if (
nextLength > maxLength ||
sentenceCount >= contentTruncation.maxSentences
) {
break;
}
sentenceLength = nextLength;
sentenceCount++;
}
if (sentenceLength > preferredLength) {
return sentences.slice(0, sentenceCount).join('.') + '.';
}
}
// If sentence boundary doesn't work well, try paragraph boundaries
const paragraphs = content.split(/\n\s*\n/);
if (paragraphs.length > 1) {
let paragraphLength = 0;
for (let i = 0; i < paragraphs.length; i++) {
const nextLength = paragraphLength + paragraphs[i].length + 2; // +2 for \n\n
if (nextLength > maxLength) {
break;
}
paragraphLength = nextLength;
if (paragraphLength > preferredLength) {
return paragraphs.slice(0, i + 1).join('\n\n');
}
}
}
// Fallback to word boundary truncation
const words = truncated.split(' ');
const lastCompleteWord = words.slice(0, -1).join(' ');
if (lastCompleteWord.length > preferredLength) {
return lastCompleteWord + '...';
}
// Final fallback to character truncation
return content.substring(0, maxLength) + '...';
};
// Scrape webpage using Firecrawl with retry logic
async function scrapeWebpage(
url: string
): Promise<{ content: string; screenshot?: string }> {
return withRetry(async () => {
const firecrawl = getFirecrawlClient();
try {
const scrapeResponse = await firecrawl.scrapeUrl(url, {
formats: ['markdown', 'screenshot'],
onlyMainContent: webContentAnalyzerConfig.firecrawl.onlyMainContent,
waitFor: webContentAnalyzerConfig.firecrawl.waitFor,
});
if (!scrapeResponse.success) {
throw new WebContentAnalyzerError(
ErrorType.SCRAPING,
scrapeResponse.error || 'Failed to scrape webpage',
'Unable to access the webpage. Please check the URL and try again.',
ErrorSeverity.MEDIUM,
true
);
}
const content = scrapeResponse.markdown || '';
const screenshot = scrapeResponse.screenshot;
if (!content.trim()) {
throw new WebContentAnalyzerError(
ErrorType.SCRAPING,
'No content found on the webpage',
'The webpage appears to be empty or inaccessible. Please try a different URL.',
ErrorSeverity.MEDIUM,
false
);
}
return {
content: truncateContent(content, MAX_CONTENT_LENGTH),
screenshot,
};
} catch (error) {
if (error instanceof WebContentAnalyzerError) {
throw error;
}
// Classify and throw the error
throw classifyError(error);
}
});
}
// Analyze content using selected provider with retry logic
async function analyzeContent(
content: string,
url: string,
provider: string
): Promise<AnalysisResults> {
return withRetry(async () => {
try {
let model: any;
let temperature: number | undefined;
let maxTokens: number | undefined;
switch (provider) {
case 'openai':
model = createOpenAI({
apiKey: process.env.OPENAI_API_KEY,
}).chat(webContentAnalyzerConfig.openai.model);
temperature = webContentAnalyzerConfig.openai.temperature;
maxTokens = webContentAnalyzerConfig.openai.maxTokens;
break;
case 'gemini':
model = createGoogleGenerativeAI({
apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY,
}).chat(webContentAnalyzerConfig.gemini.model);
temperature = webContentAnalyzerConfig.gemini.temperature;
maxTokens = webContentAnalyzerConfig.gemini.maxTokens;
break;
case 'deepseek':
model = createDeepSeek({
apiKey: process.env.DEEPSEEK_API_KEY,
}).chat(webContentAnalyzerConfig.deepseek.model);
temperature = webContentAnalyzerConfig.deepseek.temperature;
maxTokens = webContentAnalyzerConfig.deepseek.maxTokens;
break;
case 'openrouter':
model = createOpenRouter({
apiKey: process.env.OPENROUTER_API_KEY,
}).chat(webContentAnalyzerConfig.openrouter.model);
temperature = webContentAnalyzerConfig.openrouter.temperature;
maxTokens = webContentAnalyzerConfig.openrouter.maxTokens;
break;
default:
throw new WebContentAnalyzerError(
ErrorType.VALIDATION,
'Invalid model provider',
'Please select a valid model provider.',
ErrorSeverity.MEDIUM,
false
);
}
const { object } = await generateObject({
model,
schema: analysisSchema,
prompt: `
Analyze the following webpage content and extract structured information.
URL: ${url}
Content: ${content}
Please provide accurate and relevant information based on the content. If certain information is not available, use appropriate defaults:
- For pricing: use "Not specified" if no pricing information is found
- For features and use cases: provide empty arrays if none are found
- Ensure the title and description are meaningful and based on the actual content
`,
temperature,
maxOutputTokens: maxTokens,
});
return {
...object,
url,
analyzedAt: new Date().toISOString(),
};
} catch (error) {
if (error instanceof WebContentAnalyzerError) {
throw error;
}
// Check for specific OpenAI/AI errors
if (error instanceof Error) {
const message = error.message.toLowerCase();
if (message.includes('rate limit') || message.includes('quota')) {
throw new WebContentAnalyzerError(
ErrorType.RATE_LIMIT,
error.message,
'AI service is temporarily overloaded. Please wait a moment and try again.',
ErrorSeverity.MEDIUM,
true,
error
);
}
if (message.includes('timeout') || message.includes('aborted')) {
throw new WebContentAnalyzerError(
ErrorType.TIMEOUT,
error.message,
'AI analysis timed out. Please try again with a shorter webpage.',
ErrorSeverity.MEDIUM,
true,
error
);
}
}
// Classify and throw the error
throw classifyError(error);
}
});
}
export async function POST(req: NextRequest) {
const requestId = Math.random().toString(36).substring(7);
const startTime = performance.now();
try {
// Parse and validate request
const body = await req.json();
const validationResult = analyzeContentRequestSchema.safeParse(body);
if (!validationResult.success) {
const validationError = new WebContentAnalyzerError(
ErrorType.VALIDATION,
'Invalid request parameters',
'Please provide a valid URL.',
ErrorSeverity.MEDIUM,
false
);
logError(validationError, {
requestId,
validationErrors: validationResult.error,
});
return NextResponse.json(
{
success: false,
error: validationError.userMessage,
} satisfies AnalyzeContentResponse,
{ status: 400 }
);
}
const { url, modelProvider } = validationResult.data;
console.log('modelProvider', modelProvider, 'url', url);
// Additional URL validation
const urlValidation = validateUrl(url);
if (!urlValidation.success) {
const urlError = new WebContentAnalyzerError(
ErrorType.VALIDATION,
urlValidation.error.issues[0]?.message || 'Invalid URL',
'Please enter a valid URL starting with http:// or https://',
ErrorSeverity.MEDIUM,
false
);
logError(urlError, { requestId, url });
return NextResponse.json(
{
success: false,
error: urlError.userMessage,
} satisfies AnalyzeContentResponse,
{ status: 400 }
);
}
// Check if Firecrawl is configured
if (!validateFirecrawlConfig()) {
const configError = new WebContentAnalyzerError(
ErrorType.SERVICE_UNAVAILABLE,
'Firecrawl API key is not configured',
'Web content analysis service is temporarily unavailable.',
ErrorSeverity.CRITICAL,
false
);
logError(configError, { requestId });
return NextResponse.json(
{
success: false,
error: configError.userMessage,
} satisfies AnalyzeContentResponse,
{ status: 503 }
);
}
console.log(`Starting analysis [requestId=${requestId}, url=${url}]`);
// Perform analysis with timeout and enhanced error handling
const analysisPromise = (async () => {
try {
// Step 1: Scrape webpage
const { content, screenshot } = await scrapeWebpage(url);
// Step 2: Analyze content with AI (pass provider)
const analysis = await analyzeContent(content, url, modelProvider);
return { analysis, screenshot };
} catch (error) {
// If it's already a WebContentAnalyzerError, just re-throw
if (error instanceof WebContentAnalyzerError) {
throw error;
}
// Otherwise classify the error
throw classifyError(error);
}
})();
// Apply timeout wrapper
const result = await withTimeout(analysisPromise, TIMEOUT_MILLIS);
const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
console.log(
`Analysis completed [requestId=${requestId}, elapsed=${elapsed}s]`
);
return NextResponse.json({
success: true,
data: result,
} satisfies AnalyzeContentResponse);
} catch (error) {
const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
// Classify the error if it's not already a WebContentAnalyzerError
const analyzedError =
error instanceof WebContentAnalyzerError ? error : classifyError(error);
// Log the error with context
logError(analyzedError, {
requestId,
elapsed: `${elapsed}s`,
url: req.url,
});
// Determine status code based on error type
let statusCode = 500;
switch (analyzedError.type) {
case ErrorType.VALIDATION:
statusCode = 400;
break;
case ErrorType.TIMEOUT:
statusCode = 408;
break;
case ErrorType.SCRAPING:
statusCode = 422;
break;
case ErrorType.RATE_LIMIT:
statusCode = 429;
break;
case ErrorType.SERVICE_UNAVAILABLE:
statusCode = 503;
break;
default:
statusCode = 500;
}
return NextResponse.json(
{
success: false,
error: analyzedError.userMessage,
} satisfies AnalyzeContentResponse,
{ status: statusCode }
);
}
}