Skip to content

Commit

Permalink
Merge pull request langchain-ai#1118 from martinseanhunt/confluence-l…
Browse files Browse the repository at this point in the history
…oader

Confluence loader
  • Loading branch information
nfcampos authored May 4, 2023
2 parents e85d4dd + 4186d81 commit 7ce2730
Show file tree
Hide file tree
Showing 8 changed files with 271 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
sidebar_class_name: node-only
---

# Confluence

:::tip Compatibility
Only available on Node.js.
:::

This covers how to load document objects from pages in a Confluence space.

## Credentials

- You'll need to set up an access token and provide it along with your Confluence username in order to authenticate the request.
- You'll also need the `space key` for the space containing the pages to load as documents. This can be found in the URL when navigating to your space, e.g. `https://example.atlassian.net/wiki/spaces/{SPACE_KEY}`.
- You'll also need to install `html-to-text`, which is used to parse the pages into plain text:

```bash npm2yarn
npm install html-to-text
```

## Usage

import CodeBlock from "@theme/CodeBlock";
import Example from "@examples/document_loaders/confluence.ts";

<CodeBlock language="typescript">{Example}</CodeBlock>
20 changes: 20 additions & 0 deletions examples/src/document_loaders/confluence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { ConfluencePagesLoader } from "langchain/document_loaders/web/confluence";

// Credentials are read from the environment so they never live in source.
const username = process.env.CONFLUENCE_USERNAME;
const accessToken = process.env.CONFLUENCE_ACCESS_TOKEN;

// Without both credentials the request cannot be authenticated, so bail out
// with a hint instead of attempting the load.
if (!username || !accessToken) {
  console.log(
    "You must provide a username and access token to run this example."
  );
} else {
  const confluenceLoader = new ConfluencePagesLoader({
    baseUrl: "https://example.atlassian.net/wiki",
    spaceKey: "~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE",
    username,
    accessToken,
  });

  // One Document is produced per item returned for the space.
  const docs = await confluenceLoader.load();
  console.log(docs);
}
3 changes: 3 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ document_loaders/web/github.d.ts
document_loaders/web/s3.cjs
document_loaders/web/s3.js
document_loaders/web/s3.d.ts
document_loaders/web/confluence.cjs
document_loaders/web/confluence.js
document_loaders/web/confluence.d.ts
document_loaders/fs/directory.cjs
document_loaders/fs/directory.js
document_loaders/fs/directory.d.ts
Expand Down
8 changes: 8 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@
"document_loaders/web/s3.cjs",
"document_loaders/web/s3.js",
"document_loaders/web/s3.d.ts",
"document_loaders/web/confluence.cjs",
"document_loaders/web/confluence.js",
"document_loaders/web/confluence.d.ts",
"document_loaders/fs/directory.cjs",
"document_loaders/fs/directory.js",
"document_loaders/fs/directory.d.ts",
Expand Down Expand Up @@ -803,6 +806,11 @@
"import": "./document_loaders/web/s3.js",
"require": "./document_loaders/web/s3.cjs"
},
"./document_loaders/web/confluence": {
"types": "./document_loaders/web/confluence.d.ts",
"import": "./document_loaders/web/confluence.js",
"require": "./document_loaders/web/confluence.cjs"
},
"./document_loaders/fs/directory": {
"types": "./document_loaders/fs/directory.d.ts",
"import": "./document_loaders/fs/directory.js",
Expand Down
2 changes: 2 additions & 0 deletions langchain/scripts/create-entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ const entrypoints = {
"document_loaders/web/imsdb": "document_loaders/web/imsdb",
"document_loaders/web/github": "document_loaders/web/github",
"document_loaders/web/s3": "document_loaders/web/s3",
"document_loaders/web/confluence": "document_loaders/web/confluence",
"document_loaders/fs/directory": "document_loaders/fs/directory",
"document_loaders/fs/buffer": "document_loaders/fs/buffer",
"document_loaders/fs/text": "document_loaders/fs/text",
Expand Down Expand Up @@ -168,6 +169,7 @@ const requiresOptionalDependency = [
"document_loaders/web/imsdb",
"document_loaders/web/github",
"document_loaders/web/s3",
"document_loaders/web/confluence",
"document_loaders/fs/directory",
"document_loaders/fs/buffer",
"document_loaders/fs/text",
Expand Down
81 changes: 81 additions & 0 deletions langchain/src/document_loaders/tests/confluence.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import { test, jest, expect } from "@jest/globals";
import {
ConfluencePagesLoader,
ConfluenceAPIResponse,
} from "../web/confluence.js";

// Test-only view of the loader that re-declares fetchConfluenceData as a
// plain property. The method is `protected` on ConfluencePagesLoader, so the
// loader instance is cast to this type before being handed to jest.spyOn.
type TestConfluencePagesLoaderType = ConfluencePagesLoader & {
fetchConfluenceData: (url: string) => Promise<ConfluenceAPIResponse>;
};

test("Test ConfluenceLoader and fetchConfluenceData calls", async () => {
  // The Confluence API requires authentication, so fetchConfluenceData is
  // stubbed and serves these canned responses instead of hitting the network.
  const populatedBatch: ConfluenceAPIResponse = {
    size: 2,
    results: [
      {
        id: "1",
        title: "Page 1",
        body: { storage: { value: "<p>Content of Page 1</p>" } },
      },
      {
        id: "2",
        title: "Page 2",
        body: { storage: { value: "<p>Content of Page 2</p>" } },
      },
    ],
  };
  const emptyBatch: ConfluenceAPIResponse = { size: 0, results: [] };

  const loader = new ConfluencePagesLoader({
    baseUrl: "https://example.atlassian.net/wiki",
    spaceKey: "SPACEKEY",
    username: "[email protected]",
    accessToken: "accessToken",
  }) as TestConfluencePagesLoaderType;

  // The loader keeps requesting the next offset until a response reports a
  // size of 0, so two populated batches are followed by an empty one.
  const fetchSpy = jest
    .spyOn(loader, "fetchConfluenceData")
    .mockImplementationOnce(async () => populatedBatch)
    .mockImplementationOnce(async () => populatedBatch)
    .mockImplementationOnce(async () => emptyBatch);

  const loaded = await loader.load();

  // Two batches of two pages each yield four documents.
  expect(loaded.length).toBe(4);
  expect(loaded[0].metadata.title).toBeDefined();
  expect(loaded[0].metadata.url).toBeDefined();

  // One request per batch, including the final empty one.
  expect(fetchSpy).toHaveBeenCalledTimes(3);

  // Each request must advance `start` by the size of the previous response.
  const expectedRequestUrls = [
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=0&expand=body.storage",
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=2&expand=body.storage",
    "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SPACEKEY&limit=25&start=4&expand=body.storage",
  ];
  expectedRequestUrls.forEach((requestUrl, index) => {
    expect(fetchSpy).toHaveBeenNthCalledWith(index + 1, requestUrl);
  });

  // The metadata URL of each document points at the page it was built from.
  const expectedPageUrls = [
    "https://example.atlassian.net/wiki/spaces/SPACEKEY/pages/1",
    "https://example.atlassian.net/wiki/spaces/SPACEKEY/pages/2",
  ];
  expectedPageUrls.forEach((pageUrl, index) => {
    expect(loaded[index].metadata.url).toBe(pageUrl);
  });

  // Put the real implementation back.
  fetchSpy.mockRestore();
});
128 changes: 128 additions & 0 deletions langchain/src/document_loaders/web/confluence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import { htmlToText } from "html-to-text";
import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";

/**
 * Constructor options for ConfluencePagesLoader.
 */
export interface ConfluencePagesLoaderParams {
// Prefix that the loader appends `/rest/api/content` and `/spaces/...` to,
// e.g. "https://example.atlassian.net/wiki".
baseUrl: string;
// Sent as the `spaceKey` query parameter and used in generated page URLs.
spaceKey: string;
// Combined with accessToken as `username:accessToken` for HTTP Basic auth.
username: string;
accessToken: string;
// Sent as the `limit` query parameter of each request; the loader uses 25
// when this is omitted.
limit?: number;
}

/**
 * The fields of one `results` entry that the loader reads.
 */
export interface ConfluencePage {
// Used to build the page URL stored in the document metadata.
id: string;
// Stored as `title` in the document metadata.
title: string;
// Requested through `expand=body.storage`; `value` is the markup that the
// loader converts to plain text.
body: {
storage: {
value: string;
};
};
}

/**
 * The fields of a content-listing response that the loader reads.
 */
export interface ConfluenceAPIResponse {
// Added to the current `start` offset to request the next batch; a value of
// 0 ends pagination.
size: number;
// The entries of this batch.
results: ConfluencePage[];
}

/**
 * Document loader that reads the content of a Confluence space through the
 * `/rest/api/content` endpoint and converts every returned entry into a
 * plain-text Document.
 *
 * Requests are authenticated with HTTP Basic auth built from `username` and
 * `accessToken`. The token is encoded with `Buffer`, a Node.js global.
 */
export class ConfluencePagesLoader extends BaseDocumentLoader {
  public readonly baseUrl: string;

  public readonly spaceKey: string;

  public readonly username: string;

  public readonly accessToken: string;

  public readonly limit: number;

  /**
   * @param params - Connection settings; `limit` (the batch size requested
   * per API call) defaults to 25.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
  }: ConfluencePagesLoaderParams) {
    super();
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
  }

  /**
   * Fetches every entry of the space and converts each one to a Document.
   *
   * This method never rejects: any failure while fetching or converting is
   * logged with `console.error` and an empty array is returned instead.
   *
   * @returns One Document per fetched entry, in API order, or `[]` on error.
   */
  public async load(): Promise<Document[]> {
    try {
      const pages = await this.fetchAllPagesInSpace();
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Performs one authenticated GET request and returns the parsed JSON body.
   *
   * @param url - Fully built request URL.
   * @returns The parsed response body.
   * @throws Error if the request fails, the response status is not OK, or the
   * body cannot be parsed as JSON. The message always has the form
   * `Failed to fetch <url> from Confluence: <reason>`.
   */
  protected async fetchConfluenceData(
    url: string
  ): Promise<ConfluenceAPIResponse> {
    const authToken = Buffer.from(
      `${this.username}:${this.accessToken}`
    ).toString("base64");

    let response: Awaited<ReturnType<typeof fetch>>;
    try {
      response = await fetch(url, {
        headers: {
          Authorization: `Basic ${authToken}`,
          "Content-Type": "application/json",
          Accept: "application/json",
        },
      });
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`);
    }

    // Checked outside the try blocks so this error is thrown as-is instead of
    // being caught and wrapped a second time with the same prefix.
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url} from Confluence: ${response.status}`
      );
    }

    try {
      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`);
    }
  }

  /**
   * Collects all entries of the space by requesting consecutive batches.
   *
   * Each request asks for up to `limit` entries beginning at the current
   * offset; the offset then advances by the `size` reported in the response.
   * Pagination stops at the first response whose `size` is 0.
   *
   * Implemented as a loop rather than recursion so that a space with many
   * batches neither deepens the call stack nor re-copies the accumulated
   * results for every batch.
   *
   * @param start - Offset of the first entry to request.
   * @returns All fetched entries in the order the API returned them.
   */
  private async fetchAllPagesInSpace(start = 0): Promise<ConfluencePage[]> {
    const pages: ConfluencePage[] = [];
    let nextStart = start;

    for (;;) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${this.limit}&start=${nextStart}&expand=body.storage`;
      const data = await this.fetchConfluenceData(url);

      if (data.size === 0) {
        return pages;
      }

      pages.push(...data.results);
      nextStart += data.size;
    }
  }

  /**
   * Converts one API entry into a Document.
   *
   * @param page - Entry whose `body.storage.value` holds the markup to convert.
   * @returns A Document whose content is the plain text of the entry and
   * whose metadata carries the entry title and a URL built from `baseUrl`,
   * `spaceKey` and the entry id.
   */
  private createDocumentFromPage(page: ConfluencePage): Document {
    // Convert the HTML content to plain text
    const plainTextContent = htmlToText(page.body.storage.value, {
      wordwrap: false,
      preserveNewlines: false,
    });

    // Remove empty lines
    const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");

    // Generate the URL
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    // Return a langchain document
    return new Document({
      pageContent: textWithoutEmptyLines,
      metadata: {
        title: page.title,
        url: pageUrl,
      },
    });
  }
}
1 change: 1 addition & 0 deletions langchain/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
"src/document_loaders/web/imsdb.ts",
"src/document_loaders/web/github.ts",
"src/document_loaders/web/s3.ts",
"src/document_loaders/web/confluence.ts",
"src/document_loaders/fs/directory.ts",
"src/document_loaders/fs/buffer.ts",
"src/document_loaders/fs/text.ts",
Expand Down

0 comments on commit 7ce2730

Please sign in to comment.