forked from langchain-ai/langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
More text loaders: CSV, JSON, JSONLines (langchain-ai#168)
* Make TextLoader extendable * Add json loaders wip * Finish json/jsonl loaders * Add csv loader * Run prettier * Fix test file path assertion * Add exports * PDF loader (langchain-ai#169) * Add pdf loader * Fix error message * Add export * Lint * Directory loader, and docs for the other new loaders (langchain-ai#170) * Add directory loader * Add documentation for all new loaders * Lint * Update error message * Better sidebar layout for document loaders * Fix test paths * Fix test on windows
- Loading branch information
Showing
35 changed files
with
676 additions
and
56 deletions.
There are no files selected for viewing
3 changes: 3 additions & 0 deletions
3
docs/docs/modules/document_loaders/file_loaders/_category_.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
label: "File Loaders" | ||
collapsible: false # make the category non-collapsible | ||
collapsed: false # keep the category open by default |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# CSV files | ||
|
||
This example goes over how to load data from CSV files. The second argument is the column name to extract from the CSV file. One document will be created for each row in the CSV file. | ||
|
||
Example CSV file: | ||
|
||
```csv | ||
id,text | ||
1,This is a sentence. | ||
2,This is another sentence. | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { CSVLoader } from "langchain/document_loaders"; | ||
|
||
const loader = new CSVLoader( | ||
"src/document_loaders/example_data/example.csv", | ||
"text" | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
41 changes: 41 additions & 0 deletions
41
docs/docs/modules/document_loaders/file_loaders/directory.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
--- | ||
sidebar_position: 1 | ||
--- | ||
|
||
# Folders with multiple files | ||
|
||
This example goes over how to load data from folders with multiple files. The second argument is a map of file extensions to loader factories. Each file will be passed to the matching loader, and the resulting documents will be concatenated together. | ||
|
||
Example folder: | ||
|
||
```text | ||
src/document_loaders/example_data/example/ | ||
├── example.json | ||
├── example.jsonl | ||
├── example.txt | ||
└── example.csv | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { | ||
DirectoryLoader, | ||
JSONLoader, | ||
JSONLinesLoader, | ||
TextLoader, | ||
CSVLoader, | ||
} from "langchain/document_loaders"; | ||
|
||
const loader = new DirectoryLoader( | ||
"src/document_loaders/example_data/example", | ||
{ | ||
".json": (path) => new JSONLoader(path, "/texts"), | ||
".jsonl": (path) => new JSONLinesLoader(path, "/html"), | ||
".txt": (path) => new TextLoader(path), | ||
".csv": (path) => new CSVLoader(path, "text"), | ||
} | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# JSON files | ||
|
||
This example goes over how to load data from JSON files. The second argument is a JSONPointer to the array of strings to extract from the JSON file. One document will be created for each string in the array. You can omit the second argument to load a JSON file containing an array of strings. | ||
|
||
Example JSON file: | ||
|
||
```json | ||
{ | ||
"texts": ["This is a sentence.", "This is another sentence."] | ||
} | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { JSONLoader } from "langchain/document_loaders"; | ||
|
||
const loader = new JSONLoader( | ||
"src/document_loaders/example_data/example.json", | ||
"/texts" | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
23 changes: 23 additions & 0 deletions
23
docs/docs/modules/document_loaders/file_loaders/jsonlines.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# JSONLines files | ||
|
||
This example goes over how to load data from JSONLines or JSONL files. The second argument is a JSONPointer to the property to extract from each JSON object in the file. One document will be created for each JSON object in the file. | ||
|
||
Example JSONLines file: | ||
|
||
```json | ||
{"html": "This is a sentence."} | ||
{"html": "This is another sentence."} | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { JSONLinesLoader } from "langchain/document_loaders"; | ||
|
||
const loader = new JSONLinesLoader( | ||
"src/document_loaders/example_data/example.jsonl", | ||
"/html" | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# PDF files | ||
|
||
This example goes over how to load data from PDF files. | ||
|
||
```typescript | ||
import { PDFLoader } from "langchain/document_loaders"; | ||
|
||
const loader = new PDFLoader("src/document_loaders/example_data/example.pdf"); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions
3
docs/docs/modules/document_loaders/web_loaders/_category_.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
label: "Web Loaders" | ||
collapsible: false # make the category non-collapsible | ||
collapsed: false # keep the category open by default |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions
4
...ocs/modules/document_loaders/web_pages.md → ...document_loaders/web_loaders/web_pages.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,7 @@ | ||
--- | ||
sidebar_position: 1 | ||
--- | ||
|
||
# Webpages | ||
|
||
This example goes over how to load data from webpages. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import { csvParse } from "d3-dsv"; | ||
|
||
import { TextLoader } from "./text.js"; | ||
|
||
/**
 * Text loader for CSV files that extracts the values of a single named
 * column, yielding one string per data row.
 */
export class CSVLoader extends TextLoader {
  /**
   * @param filePath - Path of the CSV file; passed through to TextLoader.
   * @param column - Header name of the column whose values are extracted.
   */
  constructor(filePath: string, public column: string) {
    super(filePath);
  }

  /**
   * Parses the raw CSV text and collects the configured column from each row.
   *
   * @param raw - Full contents of the CSV file; surrounding whitespace is
   *   trimmed before parsing.
   * @returns The value of `this.column` for every row, in file order.
   * @throws Error if the header row does not contain `this.column`.
   */
  protected async parse(raw: string): Promise<string[]> {
    const { column } = this;
    const rows = csvParse(raw.trim());

    const hasColumn = rows.columns.includes(column);
    if (!hasColumn) {
      throw new Error(`Column ${column} not found in CSV file.`);
    }

    const values: string[] = [];
    for (const row of rows) {
      // Note TextLoader will raise an exception if the value is null.
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      values.push(row[column]!);
    }
    return values;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import type { extname as ExtnameT, resolve as ResolveT } from "path"; | ||
import type { readdir as ReaddirT } from "fs/promises"; | ||
import { Document } from "../document.js"; | ||
import { getEnv } from "../util/env.js"; | ||
import { BaseDocumentLoader } from "./base.js"; | ||
|
||
/**
 * Policy applied by DirectoryLoader to a directory entry whose file
 * extension has no registered loader.
 */
export enum UnknownHandling {
  /** Skip the entry without any output. */
  Ignore = "ignore",
  /** Skip the entry and report it through `console.warn`. */
  Warn = "warn",
  /** Abort loading by throwing an `Error`. */
  Error = "error",
}
|
||
/**
 * Loads every file in a directory by handing each one to a loader selected
 * by file extension, then concatenates the documents those loaders return.
 */
export class DirectoryLoader extends BaseDocumentLoader {
  /**
   * @param directoryPath - Directory whose entries are read.
   * @param loaders - Map from file extension, including the leading dot
   *   (e.g. ".txt"), to a factory that builds a loader for a file path.
   * @param recursive - When true, subdirectories are loaded with the same
   *   settings; when false, they are skipped.
   * @param unknown - What to do with a non-directory entry whose extension
   *   has no entry in `loaders`.
   * @throws Error if `loaders` is empty or any key does not start with ".".
   */
  constructor(
    public directoryPath: string,
    public loaders: {
      [extension: string]: (filePath: string) => BaseDocumentLoader;
    },
    public recursive: boolean = true,
    public unknown: UnknownHandling = UnknownHandling.Warn
  ) {
    super();

    const extensions = Object.keys(loaders);
    if (extensions.length === 0) {
      throw new Error("Must provide at least one loader");
    }
    // Object.keys already yields only own enumerable keys, so iterating it
    // avoids for...in plus Object.hasOwn, which needs an ES2022 runtime/lib.
    for (const extension of extensions) {
      if (extension[0] !== ".") {
        throw new Error(`Extension must start with a dot: ${extension}`);
      }
    }
  }

  /**
   * Reads the directory and loads each entry in the order `readdir` returns
   * them, awaiting each loader before moving on to the next entry.
   *
   * @returns All documents produced by the matching loaders, concatenated.
   * @throws Error for an unmatched file when `unknown` is
   *   `UnknownHandling.Error`, or when `unknown` holds an unrecognised value.
   */
  public async load(): Promise<Document[]> {
    const { readdir, extname, resolve } = await DirectoryLoader.imports();
    const files = await readdir(this.directoryPath, { withFileTypes: true });

    const documents: Document[] = [];

    for (const file of files) {
      const fullPath = resolve(this.directoryPath, file.name);
      if (file.isDirectory()) {
        if (this.recursive) {
          // Recurse with identical settings so nested files are treated the
          // same way as top-level ones.
          const loader = new DirectoryLoader(
            fullPath,
            this.loaders,
            this.recursive,
            this.unknown
          );
          documents.push(...(await loader.load()));
        }
      } else {
        // I'm aware some things won't be files,
        // but they will be caught by the "unknown" handling below.
        const loaderFactory = this.loaders[extname(file.name)];
        if (loaderFactory) {
          const loader = loaderFactory(fullPath);
          documents.push(...(await loader.load()));
        } else {
          switch (this.unknown) {
            case UnknownHandling.Ignore:
              break;
            case UnknownHandling.Warn:
              console.warn(`Unknown file type: ${file.name}`);
              break;
            case UnknownHandling.Error:
              throw new Error(`Unknown file type: ${file.name}`);
            default:
              // Reached only if a value outside the enum was passed in.
              throw new Error(`Unknown unknown handling: ${this.unknown}`);
          }
        }
      }
    }

    return documents;
  }

  /**
   * Dynamically imports the "path" and "fs/promises" functions used by
   * `load`, so the modules are only requested when loading actually runs.
   *
   * @throws Error naming the environment reported by `getEnv()` when either
   *   import fails; the original failure is logged with `console.error`.
   */
  static async imports(): Promise<{
    readdir: typeof ReaddirT;
    extname: typeof ExtnameT;
    resolve: typeof ResolveT;
  }> {
    try {
      const { extname, resolve } = await import("path");
      const { readdir } = await import("fs/promises");
      return { readdir, extname, resolve };
    } catch (e) {
      console.error(e);
      throw new Error(
        `Failed to load fs/promises. DirectoryLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`
      );
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import jsonpointer from "jsonpointer"; | ||
|
||
import { TextLoader } from "./text.js"; | ||
|
||
/**
 * Text loader for JSON files that extracts the value found at a JSON
 * pointer, treating an array as a list of texts and anything else as a
 * single text.
 */
export class JSONLoader extends TextLoader {
  /**
   * @param filePath - Path of the JSON file; passed through to TextLoader.
   * @param pointer - JSON pointer to the value to extract. Defaults to the
   *   empty pointer.
   */
  constructor(filePath: string, public pointer: string = "") {
    super(filePath);
  }

  /**
   * Parses the raw JSON text and resolves `this.pointer` against it.
   *
   * @param raw - Full contents of the JSON file; trimmed before parsing.
   * @returns The resolved value itself when it is an array, otherwise a
   *   one-element array wrapping the resolved value.
   */
  protected async parse(raw: string): Promise<string[]> {
    const document = JSON.parse(raw.trim());
    const resolved = jsonpointer.compile(this.pointer).get(document);
    if (Array.isArray(resolved)) {
      return resolved;
    }
    return [resolved];
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import jsonpointer from "jsonpointer"; | ||
|
||
import { TextLoader } from "./text.js"; | ||
|
||
/**
 * Text loader for JSON Lines files that extracts the value at a JSON pointer
 * from the JSON value found on each non-blank line.
 */
export class JSONLinesLoader extends TextLoader {
  /**
   * @param filePath - Path of the JSON Lines file; passed through to
   *   TextLoader.
   * @param pointer - JSON pointer evaluated against each line's parsed value.
   */
  constructor(filePath: string, public pointer: string) {
    super(filePath);
  }

  /**
   * Splits the raw text on "\n", parses every non-blank line as JSON, and
   * then resolves `this.pointer` against each parsed value.
   *
   * @param raw - Full contents of the JSON Lines file.
   * @returns One extracted value per non-blank line, in file order.
   */
  protected async parse(raw: string): Promise<string[]> {
    // Parse every line first, so malformed JSON is reported before the
    // pointer is compiled.
    const records = [];
    for (const rawLine of raw.split("\n")) {
      const line = rawLine.trim();
      if (line) {
        records.push(JSON.parse(line));
      }
    }

    const compiled = jsonpointer.compile(this.pointer);
    return records.map((record) => compiled.get(record));
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import type { readFile as ReadFileT } from "fs/promises"; | ||
// the main entrypoint has some debug code that we don't want to import | ||
import pdf from "pdf-parse/lib/pdf-parse.js"; | ||
import { Document } from "../document.js"; | ||
import { getEnv } from "../util/env.js"; | ||
import { BaseDocumentLoader } from "./base.js"; | ||
|
||
/**
 * Document loader that reads a PDF file and returns its extracted text as a
 * single document.
 */
export class PDFLoader extends BaseDocumentLoader {
  /**
   * @param filePath - Path of the PDF file to read.
   */
  constructor(public filePath: string) {
    super();
  }

  /**
   * Reads the file, runs it through the PDF parser, and wraps the extracted
   * text in one Document.
   *
   * @returns A one-element array whose document has the parsed text as
   *   `pageContent` and `{ source: filePath }` as metadata.
   */
  public async load(): Promise<Document[]> {
    const fs = await PDFLoader.imports();
    const bytes = await fs.readFile(this.filePath);
    const { text } = await pdf(bytes);
    const document = new Document({
      pageContent: text,
      metadata: { source: this.filePath },
    });
    return [document];
  }

  /**
   * Dynamically imports `readFile` from "fs/promises", so the module is only
   * requested when loading actually runs.
   *
   * @throws Error naming the environment reported by `getEnv()` when the
   *   import fails; the original failure is logged with `console.error`.
   */
  static async imports(): Promise<{
    readFile: typeof ReadFileT;
  }> {
    try {
      const fsPromises = await import("fs/promises");
      return { readFile: fsPromises.readFile };
    } catch (e) {
      console.error(e);
      throw new Error(
        `Failed to load fs/promises. PDFLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https://<link to docs> for alternatives.`
      );
    }
  }
}
Oops, something went wrong.