forked from langchain-ai/langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request langchain-ai#1118 from martinseanhunt/confluence-l…
…oader Confluence loader
- Loading branch information
Showing
8 changed files
with
271 additions
and
0 deletions.
There are no files selected for viewing
28 changes: 28 additions & 0 deletions
28
docs/docs/modules/indexes/document_loaders/examples/web_loaders/confluence.mdx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
--- | ||
sidebar_class_name: node-only | ||
--- | ||
|
||
# Confluence | ||
|
||
:::tip Compatibility | ||
Only available on Node.js. | ||
::: | ||
|
||
This covers how to load document objects from pages in a Confluence space. | ||
|
||
## Credentials | ||
|
||
- You'll need to set up an access token and provide it, along with your Confluence username, to authenticate requests. | ||
- You'll also need the `space key` of the space containing the pages to load as documents. You can find it in the URL when navigating to your space, e.g. `https://example.atlassian.net/wiki/spaces/{SPACE_KEY}`. | ||
- Finally, you'll need to install `html-to-text`, which is used to convert the pages into plain text. | ||
|
||
```bash npm2yarn | ||
npm install html-to-text | ||
``` | ||
|
||
## Usage | ||
|
||
import CodeBlock from "@theme/CodeBlock"; | ||
import Example from "@examples/document_loaders/confluence.ts"; | ||
|
||
<CodeBlock language="typescript">{Example}</CodeBlock> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import { ConfluencePagesLoader } from "langchain/document_loaders/web/confluence";

// Credentials are read from the environment so they never end up in source control.
const username = process.env.CONFLUENCE_USERNAME;
const accessToken = process.env.CONFLUENCE_ACCESS_TOKEN;

if (username && accessToken) {
  // baseUrl points at the wiki root; spaceKey is the key shown in the space's URL.
  const loader = new ConfluencePagesLoader({
    baseUrl: "https://example.atlassian.net/wiki",
    spaceKey: "~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE",
    username,
    accessToken,
  });

  // Loads every page of the space as a plain-text document.
  const documents = await loader.load();
  console.log(documents);
} else {
  console.log(
    "You must provide a username and access token to run this example."
  );
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import { test, jest, expect } from "@jest/globals"; | ||
import { | ||
ConfluencePagesLoader, | ||
ConfluenceAPIResponse, | ||
} from "../web/confluence.js"; | ||
|
||
// Test-only view of the loader that exposes the protected fetchConfluenceData
// method so it can be stubbed with jest.spyOn.
type TestConfluencePagesLoaderType = ConfluencePagesLoader & {
  fetchConfluenceData: (url: string) => Promise<ConfluenceAPIResponse>;
};

test("Test ConfluenceLoader and fetchConfluenceData calls", async () => {
  // Stub the fetchConfluenceData method to return a fake response,
  // as the Confluence API requires authentication
  const fakeResponse = [
    {
      id: "1",
      title: "Page 1",
      body: { storage: { value: "<p>Content of Page 1</p>" } },
    },
    {
      id: "2",
      title: "Page 2",
      body: { storage: { value: "<p>Content of Page 2</p>" } },
    },
  ];

  // Initialize the loader and load the documents
  const loader = new ConfluencePagesLoader({
    baseUrl: "https://example.atlassian.net/wiki",
    spaceKey: "SPACEKEY",
    username: "username@email.com",
    accessToken: "accessToken",
  }) as TestConfluencePagesLoaderType;

  // load() keeps calling fetchConfluenceData, advancing `start` by the size of
  // each response, until a response reports a size of 0
  const fetchConfluenceDataMock = jest
    .spyOn(loader, "fetchConfluenceData")
    .mockImplementationOnce(() =>
      Promise.resolve({ size: 2, results: fakeResponse })
    )
    .mockImplementationOnce(() =>
      Promise.resolve({ size: 2, results: fakeResponse })
    )
    .mockImplementationOnce(() => Promise.resolve({ size: 0, results: [] }));

  const documents = await loader.load();

  // Validate the test results: two responses of two pages each
  expect(documents.length).toBe(4);
  expect(documents[0].metadata.title).toBeDefined();
  expect(documents[0].metadata.url).toBeDefined();

  // Ensure fetchConfluenceData is called three times
  expect(fetchConfluenceDataMock).toHaveBeenCalledTimes(3);

  // Ensure the arguments are correct for each call
  expect(fetchConfluenceDataMock).toHaveBeenNthCalledWith(
    1,
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=0&expand=body.storage"
  );
  expect(fetchConfluenceDataMock).toHaveBeenNthCalledWith(
    2,
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=2&expand=body.storage"
  );
  expect(fetchConfluenceDataMock).toHaveBeenNthCalledWith(
    3,
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=4&expand=body.storage"
  );

  // Check if the generated URLs in the metadata are correct
  expect(documents[0].metadata.url).toBe(
    "https://example.atlassian.net/wiki/spaces/SPACEKEY/pages/1"
  );
  expect(documents[1].metadata.url).toBe(
    "https://example.atlassian.net/wiki/spaces/SPACEKEY/pages/2"
  );

  // Restore the mock to its original behavior
  fetchConfluenceDataMock.mockRestore();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import { htmlToText } from "html-to-text"; | ||
import { Document } from "../../document.js"; | ||
import { BaseDocumentLoader } from "../base.js"; | ||
|
||
/**
 * Options accepted by the ConfluencePagesLoader constructor.
 */
export interface ConfluencePagesLoaderParams {
  /** Root URL of the wiki, e.g. `https://example.atlassian.net/wiki`. */
  baseUrl: string;
  /** Key of the space whose pages are loaded. */
  spaceKey: string;
  /** Username sent, together with the access token, as HTTP Basic credentials. */
  username: string;
  /** Access token used as the password part of the Basic credentials. */
  accessToken: string;
  /** Value of the `limit` query parameter on each request; the loader defaults it to 25. */
  limit?: number;
}

/**
 * The subset of a Confluence page (requested with `expand=body.storage`)
 * that the loader reads.
 */
export interface ConfluencePage {
  id: string;
  title: string;
  body: {
    storage: {
      /** Page content as HTML; the loader converts it to plain text. */
      value: string;
    };
  };
}

/**
 * The subset of one paginated content-listing response that the loader reads.
 */
export interface ConfluenceAPIResponse {
  /** The loader advances `start` by this amount and stops paginating when it is 0. */
  size: number;
  results: ConfluencePage[];
}
|
||
/**
 * Document loader that fetches every page of a single Confluence space via
 * the Confluence REST API and converts each page's HTML body into a
 * plain-text Document.
 *
 * Uses `Buffer` to build the Basic auth header, so it targets Node.js.
 */
export class ConfluencePagesLoader extends BaseDocumentLoader {
  public readonly baseUrl: string;

  public readonly spaceKey: string;

  public readonly username: string;

  public readonly accessToken: string;

  public readonly limit: number;

  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
  }: ConfluencePagesLoaderParams) {
    super();
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
  }

  /**
   * Loads all pages of the configured space as documents.
   *
   * Best-effort by design: any failure while fetching or converting is logged
   * with console.error and an empty array is returned instead of throwing.
   *
   * @returns One Document per page, in the order the API returned them.
   */
  public async load(): Promise<Document[]> {
    try {
      const pages = await this.fetchAllPagesInSpace();
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Performs one authenticated GET against the Confluence API.
   *
   * @param url Fully built request URL.
   * @returns The parsed JSON body of the response.
   * @throws Error when the request fails, the response status is not OK, or
   *   the body cannot be parsed as JSON.
   */
  protected async fetchConfluenceData(
    url: string
  ): Promise<ConfluenceAPIResponse> {
    const authToken = Buffer.from(
      `${this.username}:${this.accessToken}`
    ).toString("base64");

    let response: Response;
    try {
      response = await fetch(url, {
        headers: {
          Authorization: `Basic ${authToken}`,
          "Content-Type": "application/json",
          Accept: "application/json",
        },
      });
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`);
    }

    // Checked outside the try blocks so this error is not caught and wrapped a
    // second time, which used to repeat the message prefix.
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url} from Confluence: ${response.status}`
      );
    }

    try {
      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`);
    }
  }

  /**
   * Walks the paginated content listing of the space, starting at `start`,
   * until a response reports a size of 0.
   *
   * @param start Offset of the first result to request.
   * @returns All pages collected across the responses, in request order.
   */
  private async fetchAllPagesInSpace(start = 0): Promise<ConfluencePage[]> {
    // Encode the key so unusual characters cannot corrupt the query string.
    const encodedSpaceKey = encodeURIComponent(this.spaceKey);
    const pages: ConfluencePage[] = [];
    let nextStart = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${encodedSpaceKey}&limit=${this.limit}&start=${nextStart}&expand=body.storage`;
      // Requests are inherently sequential: each offset depends on the size
      // of the previous response.
      // eslint-disable-next-line no-await-in-loop
      const data = await this.fetchConfluenceData(url);

      if (data.size === 0) {
        return pages;
      }

      pages.push(...data.results);
      nextStart += data.size;
    }
  }

  /**
   * Converts one Confluence page into a Document whose metadata carries the
   * page title and a link to the page.
   */
  private createDocumentFromPage(page: ConfluencePage): Document {
    // Convert the HTML content to plain text
    const plainTextContent = htmlToText(page.body.storage.value, {
      wordwrap: false,
      preserveNewlines: false,
    });

    // Remove empty lines
    const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");

    // Generate the URL
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    // Return a langchain document
    return new Document({
      pageContent: textWithoutEmptyLines,
      metadata: {
        title: page.title,
        url: pageUrl,
      },
    });
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters