Skip to content

Commit

Permalink
fix(cheerio): Move selector to constructor
Browse files: browse the repository at this point in the history
  • Loading branch information
magick93 committed Apr 11, 2023
1 parent f9f8855 commit 287cdd1
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 6 deletions.
11 changes: 7 additions & 4 deletions langchain/src/document_loaders/cheerio_web_base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@ export class CheerioWebBaseLoader

caller: AsyncCaller;

-constructor(public webPath: string, fields?: WebBaseLoaderParams) {
-super();
+selector?: SelectorType;
+
+constructor(public webPath: string, fields?: WebBaseLoaderParams, selector?: SelectorType) {
+super();
 const { timeout, ...rest } = fields ?? {};
 this.timeout = timeout ?? 10000;
 this.caller = new AsyncCaller(rest);
+this.selector = selector;
}

static async _scrape(
Expand All @@ -47,9 +50,9 @@ export class CheerioWebBaseLoader
);
}

-async load(selector?: SelectorType): Promise<Document[]> {
+async load(): Promise<Document[]> {
 const $ = await this.scrape();
-const text = $(selector).text() ?? $("body").text();
+const text = $(this.selector ?? "body").text();
const metadata = { source: this.webPath };
return [new Document({ pageContent: text, metadata })];
}
Expand Down
5 changes: 3 additions & 2 deletions langchain/src/document_loaders/tests/cheerio_web.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ test("Test cheerio web scraper loader", async () => {
});

test("Test cheerio web scraper loader with selector", async () => {
-const loader = new CheerioWebBaseLoader("https://about.google/commitments/");
 const selectH1 = "h1";
-const doc = await loader.load(selectH1);
+const loader = new CheerioWebBaseLoader("https://about.google/commitments/", {}, selectH1);
+
+const doc = await loader.load();
expect(doc[0].pageContent.trim()).toBe(
"Committed to significantly improving the lives of as many people as possible."
);
Expand Down

0 comments on commit 287cdd1

Please sign in to comment.