Skip to content

Commit

Permalink
add tests for document loaders (langchain-ai#80)
Browse files Browse the repository at this point in the history
* cr

* cr

* cr

* cr

* cr

* cr

* cr

* Update after move to ESM

---------

Co-authored-by: Nuno Campos <[email protected]>
  • Loading branch information
hwchase17 and nfcampos authored Feb 27, 2023
1 parent 91fe9ac commit 7114a07
Show file tree
Hide file tree
Showing 12 changed files with 1,487 additions and 47 deletions.
2 changes: 1 addition & 1 deletion langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,4 @@
"import": "./document_loaders.js"
}
}
}
}
39 changes: 15 additions & 24 deletions langchain/src/document_loaders/cheerio_web_base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,16 @@ import { Document } from "../document.js";
import { BaseDocumentLoader } from "./base.js";
import type { DocumentLoader } from "./base.js";

let load: typeof LoadT | null = null;

try {
// eslint-disable-next-line global-require,import/no-extraneous-dependencies
({ load } = require("cheerio"));
} catch {
// ignore error, it will be thrown in the constructor
}

export class CheerioWebBaseLoader
extends BaseDocumentLoader
implements DocumentLoader
{
constructor(public webPath: string) {
super();

/**
* Throw error at construction time
* if cheerio package is not installed.
*/
if (load === null) {
throw new Error(
"Please install cheerio as a dependency with, e.g. `yarn add cheerio`"
);
}
}

static async _scrape(url: string): Promise<CheerioAPI> {
if (load === null) {
throw new Error(
"Please install cheerio as a dependency with, e.g. `yarn add cheerio`"
);
}
const { load } = await CheerioWebBaseLoader.imports();
const response = await fetch(url);
const html = await response.text();
return load(html);
Expand All @@ -51,4 +28,18 @@ export class CheerioWebBaseLoader
const metadata = { source: this.webPath };
return [new Document({ pageContent: text, metadata })];
}

static async imports(): Promise<{
load: typeof LoadT;
}> {
try {
const { load } = await import("cheerio");
return { load };
} catch (e) {
console.error(e);
throw new Error(
"Please install cheerio as a dependency with, e.g. `yarn add cheerio`"
);
}
}
}
2 changes: 1 addition & 1 deletion langchain/src/document_loaders/srt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ export class SRTLoader extends BaseDocumentLoader {
isJsDom,
isDeno,
// eslint-disable-next-line global-require,@typescript-eslint/no-var-requires
} = require("browser-or-node");
} = await import("browser-or-node");
let env: string;
if (isBrowser) {
env = "browser";
Expand Down
9 changes: 9 additions & 0 deletions langchain/src/document_loaders/tests/cheerio_web.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { test } from "@jest/globals";
import { CheerioWebBaseLoader } from "../cheerio_web_base.js";

/**
 * Smoke test for CheerioWebBaseLoader: builds a loader for a live
 * news.ycombinator.com item page and awaits `load()`. The test passes as
 * long as the call does not reject; the returned value is not inspected.
 */
test(
  "Test cheerio web scraper loader",
  async () => {
    const loader = new CheerioWebBaseLoader(
      "https://news.ycombinator.com/item?id=34817881"
    );
    await loader.load();
  },
  // Explicit 10 s budget: this performs a real HTTP request, which can
  // exceed Jest's default per-test timeout on a slow connection.
  10000
);
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { test } from "@jest/globals";
import { CollegeConfidentialLoader } from "../college_confidential.js";

// Smoke test: loading a live College Confidential page must complete
// without rejecting inside the 10 s timeout. The result is not inspected.
test(
  "Test College confidential loader",
  async () => {
    const pageUrl =
      "https://www.collegeconfidential.com/colleges/brown-university/";
    await new CollegeConfidentialLoader(pageUrl).load();
  },
  10000
);
Loading

0 comments on commit 7114a07

Please sign in to comment.