Skip to content

Commit

Permalink
fix merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
magick93 committed Apr 12, 2023
1 parent 595d21f commit 8051a09
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
12 changes: 11 additions & 1 deletion langchain/src/document_loaders/tests/cheerio.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { test } from "@jest/globals";
import { expect, test } from "@jest/globals";
import { CheerioWebBaseLoader } from "../web/cheerio.js";

test("Test cheerio web scraper loader", async () => {
Expand All @@ -7,3 +7,13 @@ test("Test cheerio web scraper loader", async () => {
);
await loader.load();
});

// Exercises the optional third constructor argument: with an "h1" selector the
// loader's single document should contain only the heading text.
// NOTE(review): this fetches a live page, so the expected string tracks whatever
// that page's <h1> currently says — confirm it is acceptable for CI.
test("Test cheerio web scraper loader with selector", async () => {
  const headingLoader = new CheerioWebBaseLoader(
    "https://about.google/commitments/",
    {},
    "h1"
  );

  const [firstDoc] = await headingLoader.load();
  const headingText = firstDoc.pageContent.trim();

  expect(headingText).toBe(
    "Committed to significantly improving the lives of as many people as possible."
  );
});
9 changes: 6 additions & 3 deletions langchain/src/document_loaders/web/cheerio.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { CheerioAPI, load as LoadT } from "cheerio";
import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
import { Document } from "../document.js";
import { BaseDocumentLoader } from "./base.js";
import type { DocumentLoader } from "./base.js";
Expand All @@ -19,11 +19,14 @@ export class CheerioWebBaseLoader

caller: AsyncCaller;

constructor(public webPath: string, fields?: WebBaseLoaderParams) {
selector?: SelectorType;

/**
 * @param webPath  Stored as a public property; used as `metadata.source` by `load()`.
 * @param fields   Optional settings. `timeout` is kept on the instance (falling back
 *                 to 10000 when nullish); every other field is passed to `AsyncCaller`.
 * @param selector Optional Cheerio selector kept on the instance; left undefined when omitted.
 */
constructor(
  public webPath: string,
  fields?: WebBaseLoaderParams,
  selector?: SelectorType
) {
  super();
  this.selector = selector;

  // Peel `timeout` off so only the remaining options reach the caller.
  const { timeout, ...callerOptions } = fields ?? {};
  this.caller = new AsyncCaller(callerOptions);
  this.timeout = timeout ?? 10000;
}

static async _scrape(
Expand All @@ -49,7 +52,7 @@ export class CheerioWebBaseLoader

/**
 * Awaits `this.scrape()` (defined outside this excerpt — presumably yields a
 * Cheerio handle for `webPath`; confirm) and wraps the extracted text in a
 * one-element array of `Document`, with `metadata.source` set to `webPath`.
 *
 * NOTE(review): this excerpt is a rendered diff, so the replaced and the
 * replacing `const text` statements both appear below; only the second one
 * belongs to the post-change method.
 */
async load(): Promise<Document[]> {
const $ = await this.scrape();
// Pre-change statement: always took the text of <body>.
const text = $("body").text();
// Post-change statement: use the instance's selector, falling back to "body" when it is nullish.
const text = $(this.selector ?? "body").text();
const metadata = { source: this.webPath };
return [new Document({ pageContent: text, metadata })];
}
Expand Down

0 comments on commit 8051a09

Please sign in to comment.