Feat/rag url langchain #122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

jondwillis wants to merge 10 commits into main from feat/rag-url-langchain

ui/.vscode/launch.json

-Original file line number
+Diff line change
@@ -1,11 +1,18 @@
     {
       "version": "0.2.0",
       "configurations": [
+        {
+          "name": "Launch Chrome",
+          "request": "launch",
+          "type": "chrome",
+          "url": "http://localhost:3000",
+          "webRoot": "${workspaceFolder}"
+        },
         {
           "name": "Next.js",
           "type": "node-terminal",
           "request": "launch",
-          "command": "npm run dev",
+          "command": "npm run dev --debug-brk",
           "skipFiles": ["<node_internals>/**"]
         }
       ]
@@ Expand Down @@

ui/app/api/rag-url-with-langchain/puppeteer/route.ts

-Original file line number
+Diff line change
@@ -0,0 +1,46 @@
+    import type { NextRequest } from 'next/server';
+    import chrome from 'chrome-aws-lambda';
+    import type {
+      Browser,
+      Page,
+      PuppeteerEvaluate,
+      PuppeteerGotoOptions,
+    } from 'langchain/document_loaders/web/puppeteer';
+    import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer';
+    export const runtime = 'nodejs';
+    export async function POST(req: NextRequest) {
+      const htmlOrUrl = await req.json();
+      const docs = await run(htmlOrUrl);
+      return new Response(JSON.stringify(docs), {
+        headers: {
+          'content-type': 'application/json; charset=UTF-8',
+        },
+      });
+    }
+    const run = async ({ html, url }: { html?: string; url?: string }) => {
+      /**  Loader use evaluate function ` await page.evaluate(() => document.body.innerHTML);` as default evaluate */
+      const gotoOptions: PuppeteerGotoOptions | undefined = url
+        ? { waitUntil: 'domcontentloaded', timeout: 10_000 }
+        : undefined;
+      url = url ?? 'about://about';
+      const evaluate: PuppeteerEvaluate | undefined = html
+        ? async (page: Page, _browser: Browser) => {
+            return await page.evaluate(() => html);
+          }
+        : undefined;
+      const loaderWithOptions = new PuppeteerWebBaseLoader(url, {
+        launchOptions: {
+          headless: 'new',
+          timeout: 10_000,
+          executablePath: await chrome.executablePath,
+        },
+        gotoOptions,
+        /**  Pass custom evaluate , in this case you get page and browser instances */
+        evaluate,
+      });
+      const docs = await loaderWithOptions.loadAndSplit();
+      return docs;
+    };

ui/app/api/rag-url-with-langchain/route.ts

-Original file line number
+Diff line change
@@ -0,0 +1,293 @@
+    import type { NextRequest } from 'next/server';
+    import { NextResponse } from 'next/server';
+    import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
+    import type { Message, Message as VercelChatMessage } from 'ai';
+    import { LangChainStream, StreamingTextResponse } from 'ai';
+    import { initializeAgentExecutorWithOptions } from 'langchain/agents';
+    import {
+      createRetrieverTool,
+      OpenAIAgentTokenBufferMemory,
+    } from 'langchain/agents/toolkits';
+    import { ChatOpenAI } from 'langchain/chat_models/openai';
+    import { Document } from 'langchain/document';
+    import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio';
+    import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
+    import { BufferMemory, ChatMessageHistory } from 'langchain/memory';
+    import { ContextualCompressionRetriever } from 'langchain/retrievers/contextual_compression';
+    import { DocumentCompressorPipeline } from 'langchain/retrievers/document_compressors';
+    import { EmbeddingsFilter } from 'langchain/retrievers/document_compressors/embeddings_filter';
+    import { AIMessage, ChatMessage, HumanMessage } from 'langchain/schema';
+    import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+    import { MemoryVectorStore } from 'langchain/vectorstores/memory';
+    export const runtime = 'edge';
+    enum CrawlMethod {
+      FETCH = 'FETCH',
+      EDGE_BROWSER = 'EDGE_BROWSER',
+      PUPPETEER = 'PUPPETEER',
+      // APIFY = 'APIFY',
+    }
+    const TEMPLATE = `If you don't know how to answer a question, use the available tools to look up relevant information. You should particularly do this for questions about URLs.`;
+    const convertVercelMessageToLangChainMessage = (message: VercelChatMessage) => {
+      if (message.role === 'user') {
+        return new HumanMessage(message.content);
+      } else if (message.role === 'assistant') {
+        return new AIMessage(message.content);
+      } else {
+        return new ChatMessage(message.content, message.role);
+      }
+    };
+    const modelForName = (modelName: string) => {
+      switch (modelName) {
+        case 'gemini-pro':
+          return new ChatGoogleGenerativeAI({
+            modelName,
+            temperature: 0,
+          });
+        case 'gpt-3.5-turbo':
+        case 'gpt-3.5-turbo-16k':
+        case 'gpt-4':
+        case 'gpt-4-1106-preview':
+        default:
+          return new ChatOpenAI({
+            modelName,
+            temperature: 0,
+            streaming: true,
+          });
+      }
+    };
+    const docsForCrawl = async ({
+      origin,
+      urls,
+      crawlMethod,
+    }: {
+      origin: string | null;
+      urls: string[];
+      crawlMethod: CrawlMethod | undefined;
+    }): Promise<Document[]> => {
+      const headers = new Headers({
+        'User-Agent': 'Stackwise/1.0 (https://stackwise.io)',
+      });
+      if (!crawlMethod) {
+        throw new Error('crawlMethod is required');
+      }
+      switch (crawlMethod) {
+        case CrawlMethod.EDGE_BROWSER:
+          return (
+            await (async () => {
+              const loaders = urls
+                .map(
+                  (url) =>
+                    new CheerioWebBaseLoader(url, {
+                      selector: 'body',
+                    }),
+                )
+                .map((loader) => loader.load());
+              return await Promise.all(loaders);
+            })()
+          ).flat();
+        case CrawlMethod.FETCH:
+          return (
+            await Promise.all(
+              urls.map(async (url: string) => {
+                const fetched = await fetch(url, {
+                  headers,
+                });
+                const body = await fetched.text();
+                const _contentType = fetched.headers.get('Content-Type');
+                // TODO: handle content types differently
+                return [new Document({ pageContent: body })];
+              }),
+            )
+          ).flat();
+        case CrawlMethod.PUPPETEER:
+          return (
+            await Promise.all(
+              urls.map(async (url: string) => {
+                const fetched = await fetch(
+                  `${origin}/api/rag-url-with-langchain/${crawlMethod.toLowerCase()}`,
+                  {
+                    method: 'POST',
+                    body: JSON.stringify({ url, html: undefined }),
+                  },
+                );
+                return (await fetched.json()) as Document[];
+              }),
+            )
+          ).flat();
+      }
+    };
+    function normalizeUrl(urlString: string): string {
+      // Regex pattern to match URL-like strings (with optional protocol and path/query)
+      const urlPattern = /(?:https?:\/\/)?[\w-]+(\.[\w-]+)+\.?(\/\S*)?/gi;
+      // Check if the string matches the URL pattern
+      if (urlPattern.test(urlString)) {
+        // Check if the protocol is already present
+        if (!/^https?:\/\//i.test(urlString)) {
+          // Prepend "https://" if the protocol is missing
+          urlString = `https://${urlString}`;
+        }
+      }
+      return urlString;
+    }
+    interface CrawlRequest {
+      messages: Message[];
+      returnIntermediateSteps: boolean;
+      crawlMethod: CrawlMethod;
+      urls: string[];
+      modelName: string;
+    }
+    /**
+     * This handler initializes and calls a retrieval chain. It composes the chain using
+     * LangChain Expression Language. See the docs for more information:
+     *
+     * https://js.langchain.com/docs/guides/expression_language/cookbook#conversational-retrieval-chain
+     */
+    export async function POST(req: NextRequest) {
+      try {
+        const body = (await req.json()) as Partial<CrawlRequest>;
+        const messages = body.messages ?? [];
+        const urls: string[] = (body.urls ?? []).map(normalizeUrl);
+        const crawlMethod: CrawlMethod | undefined = body.crawlMethod;
+        const modelName = body.modelName ?? 'gpt-3.5-turbo';
+        const returnIntermediateSteps: boolean =
+          body.returnIntermediateSteps ?? true;
+        const previousMessages = messages.slice(0, -1);
+        const currentMessageContent = messages[messages.length - 1].content;
+        const model = modelForName(modelName);
+        const origin = req.headers.get('origin');
+        const docs = await docsForCrawl({
+          origin,
+          urls,
+          crawlMethod,
+        });
+        const baseRetriever = await compressedRelevantDocsRetriever(docs);
+        const chat_history = (previousMessages ?? []).map(
+          convertVercelMessageToLangChainMessage,
+        );
+        const chatHistory = new ChatMessageHistory(chat_history);
+        const memory =
+          model instanceof ChatOpenAI
+            ? new OpenAIAgentTokenBufferMemory({
+                llm: model,
+                memoryKey: 'chat_history',
+                outputKey: 'output',
+                chatHistory,
+              })
+            : new BufferMemory({
+                chatHistory,
+                memoryKey: 'chat_history',
+                outputKey: 'output',
+              });
+        const { stream, handlers } = LangChainStream();
+        let resolveWithDocuments: (value: Document[]) => void;
+        const sourcesPromise = new Promise<Document[]>((resolve) => {
+          resolveWithDocuments = resolve;
+        });
+        /**
+         * Wrap the retriever in a tool to present it to the agent in a
+         * usable form.
+         */
+        const tool = createRetrieverTool(baseRetriever, {
+          name: 'search_latest_knowledge',
+          description: 'Searches and returns up-to-date general information.',
+        });
+        const executor = await initializeAgentExecutorWithOptions([tool], model, {
+          agentType:
+            model instanceof ChatOpenAI
+              ? 'openai-functions'
+              : 'structured-chat-zero-shot-react-description',
+          memory,
+          returnIntermediateSteps,
+          agentArgs: {
+            prefix: TEMPLATE,
+          },
+        });
+        void executor.call(
+          {
+            input: currentMessageContent,
+          },
+          [
+            handlers,
+            {
+              handleRetrieverEnd: (docs) => {
+                resolveWithDocuments(docs);
+              },
+              handleChainEnd: () => {
+                resolveWithDocuments([]);
+              },
+            },
+          ],
+        );
+        const documents = await sourcesPromise;
+        const serializedSources = Buffer.from(
+          JSON.stringify(
+            documents.map((doc) => {
+              return {
+                pageContent: doc.pageContent.slice(0, 50) + '...',
+                metadata: doc.metadata,
+              };
+            }),
+          ),
+        ).toString('base64');
+        return new StreamingTextResponse(stream, {
+          headers: {
+            'x-message-index': (previousMessages.length + 1).toString(),
+            'x-sources': serializedSources,
+          },
+        });
+      } catch (e) {
+        return NextResponse.json({ error: e.message }, { status: 500 });
+      }
+    }
+    const compressedRelevantDocsRetriever = async (docs: Document[]) => {
+      const embeddings = new OpenAIEmbeddings();
+      const embeddingsFilter = new EmbeddingsFilter({
+        embeddings,
+        similarityThreshold: 0.66,
+        k: 10,
+      });
+      const textSplitter = new RecursiveCharacterTextSplitter({
+        chunkSize: 200,
+        chunkOverlap: 20,
+      });
+      const compressorPipeline = new DocumentCompressorPipeline({
+        transformers: [textSplitter, embeddingsFilter],
+      });
+      const baseRetriever = (
+        await MemoryVectorStore.fromDocuments(docs, embeddings, {})
+      ).asRetriever();
+      const retriever = new ContextualCompressionRetriever({
+        baseCompressor: compressorPipeline,
+        baseRetriever,
+      });
+      return retriever;
+    };

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Feat/rag url langchain #122

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Feat/rag url langchain #122

Are you sure you want to change the base?

Uh oh!

Feat/rag url langchain #122

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!