Skip to content

Commit

Permalink
fix(algolia): chunk and validate size of data (GoogleChrome#220)
Browse files Browse the repository at this point in the history
* fix(algolia): chunk and validate size of data

* refactor(algolia): truncate post content instead of skipping it

* refactor(algolia): small tweaks to indexing script

* refactor(algolia): give all objects uuid and date when indexed so we can update them and remove old pages easily
  • Loading branch information
MichaelSolati authored Nov 6, 2020
1 parent ba97423 commit 4d0c72c
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 4 deletions.
65 changes: 62 additions & 3 deletions algolia.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,40 @@
*/
require('dotenv').config();
const algoliasearch = require('algoliasearch');

const fs = require('fs');
const {sizeof} = require('sizeof');

const maxChunkSizeInBytes = 10000000; // 10,000,000

/**
 * Splits an array of AlgoliaCollectionItem into consecutive batches whose
 * measured size (per `sizeof`) stays below `maxChunkSizeInBytes`, so each
 * batch can be sent to Algolia in a single request.
 *
 * Item order is preserved. Empty batches are never emitted: an empty input
 * yields an empty array, and an item that on its own reaches the limit is
 * placed in a batch by itself rather than being preceded by an empty batch.
 *
 * @param {AlgoliaCollectionItem[]} arr
 * @return {AlgoliaCollectionItem[][]}
 */
const chunkAlgolia = arr => {
  const chunked = [];
  let tempSizeInBytes = 0;
  let temp = [];
  for (const current of arr) {
    const currentSizeInBytes = sizeof(current);
    const fits = tempSizeInBytes + currentSizeInBytes < maxChunkSizeInBytes;
    // Flush the running batch only when it actually holds something;
    // otherwise an oversized first item would produce a leading empty batch.
    if (!fits && temp.length > 0) {
      chunked.push(temp);
      temp = [];
      tempSizeInBytes = 0;
    }
    temp.push(current);
    tempSizeInBytes += currentSizeInBytes;
  }
  // Avoid returning `[[]]` for an empty input.
  if (temp.length > 0) {
    chunked.push(temp);
  }
  return chunked;
};

async function index() {
const indexedOn = new Date();

if (!process.env.ALGOLIA_APP_ID || !process.env.ALGOLIA_API_KEY) {
console.warn('Missing Algolia environment variables, skipping indexing.');
return;
Expand All @@ -26,17 +57,45 @@ async function index() {
const raw = fs.readFileSync('dist/algolia.json', 'utf-8');
const algoliaData = JSON.parse(raw);

// Set date of when object is being added to algolia
algoliaData.map(e => {
e.indexedOn = indexedOn.getTime();
return e;
});

const chunkedAlgoliaData = chunkAlgolia(algoliaData);
const postsCount = algoliaData.length;

const client = algoliasearch(
process.env.ALGOLIA_APP_ID,
process.env.ALGOLIA_API_KEY
);
const index = client.initIndex('prod_developer_chrome');

console.log(`Indexing ${algoliaData.length} articles`);
console.log(
`Indexing ${postsCount} articles amongst ${chunkedAlgoliaData.length} chunk(s).`
);

// When indexing data we mark these two fields as fields that can be filtered by.
await index.setSettings({
attributesForFaceting: ['locale', 'tags'],
});

// Update algolia index with new data
for (let i = 0; i < chunkedAlgoliaData.length; i++) {
await index.saveObjects(chunkedAlgoliaData[i], {
autoGenerateObjectIDIfNotExist: true,
});
}

console.log('Updated algolia data.');

await index.replaceAllObjects(algoliaData, {
autoGenerateObjectIDIfNotExist: true,
console.log('Deleting old data no longer in algolia.json.');
await index.deleteBy({
filters: `indexedOn < ${indexedOn.getTime()}`,
});
console.log('Deleted old data.');

console.log('Done!');
}

Expand Down
12 changes: 12 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"ava": "^3.12.1",
"chalk": "^4.1.0",
"cheerio": "^1.0.0-rc.3",
"crypto": "^1.0.1",
"csso": "^4.0.3",
"dotenv": "^8.2.0",
"eslint-plugin-ava": "^11.0.0",
Expand Down Expand Up @@ -103,6 +104,7 @@
"rimraf": "^3.0.2",
"rollup-plugin-copy": "^3.3.0",
"rollup-plugin-terser": "^7.0.2",
"sizeof": "^1.0.0",
"stylelint": "^13.7.0",
"stylelint-config-sass-guidelines": "^7.1.0",
"typescript": "^3.8.3",
Expand Down
26 changes: 25 additions & 1 deletion site/_collections/algolia.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,31 @@
*/

const removeMarkdown = require('remove-markdown');
const {createHash} = require('crypto');

const {generateSrc} = require('../_shortcodes/img');

/**
 * Shrink the given fulltext so it fits within a character limit, cutting at
 * the last newline found at or before the limit so the text ends on a line
 * boundary. Falls back to a hard cut at `limit` when no usable newline exists.
 *
 * @param {string} content Text to shorten.
 * @param {number} [limit] Maximum length in UTF-16 code units (default 7500).
 * @return {string} `content` unchanged if it already fits, otherwise a prefix
 *   of it no longer than `limit`.
 */
function limitText(content, limit = 7500) {
  if (content.length <= limit) {
    return content;
  }

  // Find the nearest newline at or before the limit.
  const newlineIndex = content.lastIndexOf('\n', limit);

  // -1 means no newline was found. 0 means the only newline is the very first
  // character, and cutting there would discard all of the content, so both
  // cases fall back to a hard cut at the limit.
  const cutAt = newlineIndex > 0 ? newlineIndex : limit;
  return content.slice(0, cutAt);
}

/**
* @param {EleventyCollectionObject} collections
* @returns {AlgoliaCollectionItem[]}
Expand All @@ -38,11 +61,12 @@ module.exports = collections => {
algoliaCollectionItems.push({
title: item.data.title,
description: item.data.description,
content: removeMarkdown(item.template.frontMatter.content),
content: limitText(removeMarkdown(item.template.frontMatter.content)),
url: item.url,
tags: item.data.tags || [],
locale: item.data.locale,
photo: item.data.hero && generateSrc(item.data.hero),
objectID: createHash('md5').update(item.url).digest('hex'),
});
}
return algoliaCollectionItems;
Expand Down
1 change: 1 addition & 0 deletions types/site/_collections/algolia.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ declare global {
tags: string[];
locale: string;
photo?: string;
objectID: string;
}
}

Expand Down

0 comments on commit 4d0c72c

Please sign in to comment.