Merge pull request langchain-ai#772 from hwchase17/nc/cheerio-selector-merge

Nc/cheerio selector merge
nfcampos authored Apr 13, 2023
2 parents 64a59b6 + b0185f9 commit ca9cdee
Showing 3 changed files with 40 additions and 4 deletions.
@@ -29,3 +29,18 @@ const loader = new CheerioWebBaseLoader(

const docs = await loader.load();
```

## Usage, with a custom selector

```typescript
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";

const loader = new CheerioWebBaseLoader(
"https://news.ycombinator.com/item?id=34817881",
{
selector: "p.athing",
}
);

const docs = await loader.load();
```
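
For further illustration, the new `selector` option can sit alongside the loader's existing `timeout` option in the same params object. A minimal sketch, assuming an illustrative URL:

```typescript
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";

// Illustrative URL; `selector` accepts any Cheerio/CSS selector (defaults to "body")
// and `timeout` is the fetch timeout in milliseconds (defaults to 10s).
const loader = new CheerioWebBaseLoader("https://example.com/blog", {
  selector: "article p",
  timeout: 5000,
});

const docs = await loader.load();
```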
14 changes: 13 additions & 1 deletion langchain/src/document_loaders/tests/cheerio.test.ts
@@ -1,4 +1,4 @@
-import { test } from "@jest/globals";
+import { expect, test } from "@jest/globals";
import { CheerioWebBaseLoader } from "../web/cheerio.js";

test("Test cheerio web scraper loader", async () => {
@@ -7,3 +7,15 @@ test("Test cheerio web scraper loader", async () => {
);
await loader.load();
});

test("Test cheerio web scraper loader with selector", async () => {
const selectH1 = "h1";
const loader = new CheerioWebBaseLoader("https://about.google/commitments/", {
selector: selectH1,
});

const doc = await loader.load();
expect(doc[0].pageContent.trim()).toBe(
"Committed to significantly improving the lives of as many people as possible."
);
});
15 changes: 12 additions & 3 deletions langchain/src/document_loaders/web/cheerio.ts
@@ -1,4 +1,4 @@
-import type { CheerioAPI, load as LoadT } from "cheerio";
+import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";
import type { DocumentLoader } from "../base.js";
@@ -9,6 +9,12 @@ export interface WebBaseLoaderParams extends AsyncCallerParams {
* The timeout in milliseconds for the fetch request. Defaults to 10s.
*/
timeout?: number;

/**
* The selector to use to extract the text from the document. Defaults to
* "body".
*/
selector?: SelectorType;
}

export class CheerioWebBaseLoader
@@ -19,11 +25,14 @@ export class CheerioWebBaseLoader

caller: AsyncCaller;

selector?: SelectorType;

constructor(public webPath: string, fields?: WebBaseLoaderParams) {
super();
-    const { timeout, ...rest } = fields ?? {};
+    const { timeout, selector, ...rest } = fields ?? {};
this.timeout = timeout ?? 10000;
this.caller = new AsyncCaller(rest);
this.selector = selector ?? "body";
}

static async _scrape(
@@ -49,7 +58,7 @@ export class CheerioWebBaseLoader

async load(): Promise<Document[]> {
const $ = await this.scrape();
-    const text = $("body").text();
+    const text = $(this.selector).text();
const metadata = { source: this.webPath };
return [new Document({ pageContent: text, metadata })];
}
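
For intuition about the change in `load()`, here is a minimal standalone Cheerio sketch (the HTML string is made up) showing how a selector narrows the extracted text compared to the previously hard-coded `$("body")`:

```typescript
import { load } from "cheerio";

// Parse a small HTML snippet, then extract text the way load() now does:
// $(this.selector).text() instead of $("body").text().
const $ = load('<body><h1>Title</h1><p class="athing">First comment</p></body>');

console.log($("body").text()); // "TitleFirst comment" (all text on the page)
console.log($("p.athing").text()); // "First comment" (only the selected elements)
```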
