generated from aliceisjustplaying/ts-starter-repo
-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8800427
commit 40f8b1a
Showing
1 changed file
with
85 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,114 @@ | ||
import fs from 'fs'; | ||
import fs from 'node:fs'; | ||
|
||
import { codePointToEmoji, emojiToCodePoint, lowercaseObject } from './helpers.js'; | ||
import { Emoji, EmojiVariationSequence } from './types.js'; | ||
|
||
// Load and parse normalization data
// Converted from: https://unicode.org/Public/emoji/12.1/emoji-variation-sequences.txt
// Regex in Sublime Text form:
// Find: ([0-9A-F]{4,5}) +FE0E +; +.+? style; +\# \((\d.\d)\) ([A-Z0-9\- ]+)\n[0-9A-F]{4,5} +FE0F +; +.+? style; +\# \(\d.\d\) [A-Z0-9\- ]+\n
// Replace: {"code": "$1", "textStyle": "$1 FE0E", "emojiStyle": "$1 FE0F", "version": "$2", "name": "$3"},\n
const eVSPath = new URL('./data/emojiVariationSequences.json', import.meta.url);

// source: https://github.com/iamcal/emoji-data/blob/master/emoji.json
const eJSONPath = new URL('./data/emoji.json', import.meta.url);

// Lookup tables keyed by space-separated code-point strings. They start empty
// and are filled in place by initializeNormalizationMaps(), so each binding is
// declared exactly once and never reassigned.
const normalizationMap = new Map<string, string>();
const nonQualifiedMap = new Map<string, string>();

// Memoizes normalizeEmoji results, keyed by the raw input string.
const normalizationCache = new Map<string, string>();
/**
 * Passes every entry of `map` through the `lowercaseObject` helper and writes
 * the result back into the same Map instance.
 *
 * NOTE(review): keys are already lowercased when inserted; this round trip is
 * kept because `lowercaseObject` is defined elsewhere and may also transform
 * the values — confirm before removing it.
 */
function lowercaseMapInPlace(map: Map<string, string>): void {
  const lowercased = lowercaseObject(Object.fromEntries(map));
  map.clear();
  for (const [key, value] of Object.entries(lowercased)) {
    map.set(key, value);
  }
}

/**
 * Reads both JSON data files concurrently and fills `normalizationMap` and
 * `nonQualifiedMap` in place.
 *
 * Everything after the single `await` runs synchronously, so callers observe
 * the maps either completely empty or completely populated.
 *
 * The maps are intentionally not passed to `Object.freeze`: freezing a Map
 * object does not stop `set`/`delete`/`clear` from changing its entries, so it
 * would only give a false impression of immutability.
 *
 * @returns A promise that resolves once both maps are populated and rejects if
 *          a file cannot be read or parsed.
 */
async function initializeNormalizationMaps(): Promise<void> {
  const [eVSData, eJSONData] = await Promise.all([
    fs.promises.readFile(eVSPath, 'utf8'),
    fs.promises.readFile(eJSONPath, 'utf8'),
  ]);

  const emojiVariationSequences = JSON.parse(eVSData) as EmojiVariationSequence[];
  const emojiData = JSON.parse(eJSONData) as Emoji[];

  // Both the bare code point and its text-style sequence map to the emoji-style sequence.
  for (const seq of emojiVariationSequences) {
    normalizationMap.set(seq.code.toLowerCase(), seq.emojiStyle);
    normalizationMap.set(seq.textStyle.toLowerCase(), seq.emojiStyle);
  }
  lowercaseMapInPlace(normalizationMap);

  // The data file separates code points with '-'; the maps use spaces.
  for (const emojiEntry of emojiData) {
    if (emojiEntry.non_qualified && emojiEntry.unified) {
      nonQualifiedMap.set(
        emojiEntry.non_qualified.replaceAll('-', ' ').toLowerCase(),
        emojiEntry.unified.replaceAll('-', ' ').toLowerCase(),
      );
    }
  }
  lowercaseMapInPlace(nonQualifiedMap);
}

// Initialize the maps at startup; a load failure is treated as fatal.
initializeNormalizationMaps().catch((error: unknown) => {
  console.error('Failed to initialize normalization maps:', error);
  process.exit(1);
});
|
||
/**
 * Normalizes a single emoji string in two passes and memoizes the result.
 *
 * 1. The input is converted to a lowercased code-point string and looked up in
 *    `normalizationMap`; when there is no entry the code points are kept as is.
 * 2. The first-pass result is looked up in `nonQualifiedMap`; a different,
 *    non-empty hit replaces it.
 *
 * The lookup tables are filled asynchronously at module load. A result computed
 * while both tables are still empty is returned but NOT cached, so a call made
 * before loading finishes cannot pin an un-normalized value in the cache.
 *
 * @param emoji - The emoji string to normalize.
 * @returns The normalized emoji string.
 */
export function normalizeEmoji(emoji: string): string {
  // A single get() replaces the has()/get()! pair; '' is still a valid cached value.
  const cached = normalizationCache.get(emoji);
  if (cached !== undefined) {
    return cached;
  }

  // First Pass: Variation Sequence Normalization
  const emojiCodePoints = emojiToCodePoint(emoji).toLowerCase();
  const firstPass = normalizationMap.get(emojiCodePoints) ?? emojiCodePoints;
  let normalizedEmoji = codePointToEmoji(firstPass);

  // Second Pass: Non-Qualified to Unified Normalization
  const unifiedCodePoints = nonQualifiedMap.get(firstPass);
  if (unifiedCodePoints && unifiedCodePoints !== firstPass) {
    normalizedEmoji = codePointToEmoji(unifiedCodePoints);
  }

  // Only memoize once the lookup tables hold data (see the note above).
  const tablesLoaded = normalizationMap.size > 0 || nonQualifiedMap.size > 0;
  if (tablesLoaded) {
    normalizationCache.set(emoji, normalizedEmoji);
  }
  return normalizedEmoji;
}
|
||
/**
 * Normalizes every emoji in `emojis`, preserving order and length.
 *
 * Each element goes through `normalizeEmoji`, so the batch path shares the same
 * two-pass logic and the same memoization cache instead of carrying its own
 * copy, and it no longer writes debug output to stdout on cache hits.
 *
 * @param emojis - The emoji strings to normalize; the array is not modified.
 * @returns A new array where element `i` is the normalized form of `emojis[i]`.
 */
export function batchNormalizeEmojis(emojis: string[]): string[] {
  return emojis.map((emoji) => normalizeEmoji(emoji));
}