Skip to content

Commit

Permalink
update regex matching for cjk characters
Browse files Browse the repository at this point in the history
Regex matching is not kind to non-Latin character sets. Instead of
using word boundaries (\b) for the end of hashtags, I've changed to
a lookahead assertion checking for a space or the end of a line.

I'm going to verify what characters are allowed by twitter in a hashtag
and adjust this lookahead accordingly, in case other characters can break
a hashtag.
  • Loading branch information
kbravh committed Aug 31, 2022
1 parent b254392 commit b0a3fd3
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 3 deletions.
73 changes: 73 additions & 0 deletions __fixtures__/tweets/cjk_tweets.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import {Tweet} from 'src/models'

/**
 * Fixture: a tweet whose text mixes Latin-script and Hangul hashtags and ends
 * with a media link.
 *
 * Each `start`/`end` pair under `entities` is an end-exclusive offset into
 * `data.text`; a hashtag span covers the leading `#` plus the tag itself.
 */
export const koreanTweet: Tweet = {
  data: {
    id: '1564281755531722755',
    created_at: '2022-08-29T16:00:08.000Z',
    text: 'A Minji a day keeps the bad vibes away~\n\nPlaying with our money \n\n#JiU #지유 #Dreamcatcher #드림캐쳐 https://t.co/IRpqINac5X',
    attachments: {
      media_keys: ['3_1564281751844884482'],
    },
    conversation_id: '1564281755531722755',
    entities: {
      // Latin and Hangul tags alternate, each separated by a single space.
      hashtags: [
        {start: 66, end: 70, tag: 'JiU'},
        {start: 71, end: 74, tag: '지유'},
        {start: 75, end: 88, tag: 'Dreamcatcher'},
        {start: 89, end: 94, tag: '드림캐쳐'},
      ],
      urls: [
        {
          start: 95,
          end: 118,
          url: 'https://t.co/IRpqINac5X',
          expanded_url:
            'https://twitter.com/PaniclnTheCity/status/1564281755531722755/photo/1',
          display_url: 'pic.twitter.com/IRpqINac5X',
          // Same key as the single entry in `includes.media` below.
          media_key: '3_1564281751844884482',
        },
      ],
    },
    author_id: '2539875322',
    public_metrics: {
      retweet_count: 160,
      reply_count: 4,
      like_count: 967,
      quote_count: 9,
    },
  },
  includes: {
    media: [
      {
        media_key: '3_1564281751844884482',
        type: 'photo',
        url: 'https://pbs.twimg.com/media/FbVx9yNXEAIhA9O.jpg',
      },
    ],
    users: [
      {
        profile_image_url:
          'https://pbs.twimg.com/profile_images/1534928249217851396/Mn95uof8_normal.jpg',
        name: 'tm ツ',
        username: 'PaniclnTheCity',
        id: '2539875322',
      },
    ],
  },
}
2 changes: 2 additions & 0 deletions __fixtures__/tweets/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export * from './tweet_thread'

import {Tweet} from 'src/models'
import {cashtagTweet} from './cashtag_tweet'
import {koreanTweet} from './cjk_tweets'
import {
imageTweet,
imageTweetWithAnnotations,
Expand All @@ -18,6 +19,7 @@ import {oldProfileTweet, newProfileTweet} from './profile_pic_tweets'
import {tweetThread, tweetWithMissingParent} from './tweet_thread'
export const tweets: Record<string, Tweet> = {
[cashtagTweet.data.id]: cashtagTweet,
[koreanTweet.data.id]: koreanTweet,
[imageTweet.data.id]: imageTweet,
[imageTweetWithAnnotations.data.id]: imageTweetWithAnnotations,
[imageTweetWithAnnotationsAndNewlines.data.id]:
Expand Down
38 changes: 38 additions & 0 deletions __tests__/util.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,42 @@ describe('Entity replacements', () => {
"I'm a tweet with an [@ab](https://twitter.com/ab) mention and an [@abc](https://twitter.com/abc) mention."
)
})
// Tweet text mixing Latin and Hangul hashtags, followed by a t.co media link.
it('Correctly replaces entities with CJK characters', () => {
  const tweetText =
    'A Minji a day keeps the bad vibes away~\n\nPlaying with our money \n\n#JiU #지유 #Dreamcatcher #드림캐쳐 https://t.co/IRpqINac5X'

  const cjkEntities = {
    hashtags: [
      {start: 66, end: 70, tag: 'JiU'},
      {start: 71, end: 74, tag: '지유'},
      {start: 75, end: 88, tag: 'Dreamcatcher'},
      {start: 89, end: 94, tag: '드림캐쳐'},
    ],
    urls: [
      {
        start: 95,
        end: 118,
        url: 'https://t.co/IRpqINac5X',
        expanded_url:
          'https://twitter.com/PaniclnTheCity/status/1564281755531722755/photo/1',
        display_url: 'pic.twitter.com/IRpqINac5X',
        media_key: '3_1564281751844884482',
      },
    ],
  }

  // Every hashtag becomes a markdown link to its hashtag page, and the t.co
  // link is shown as its display URL pointing at the expanded URL.
  const expectedMarkdown =
    "A Minji a day keeps the bad vibes away~\n\nPlaying with our money \n\n[#JiU](https://twitter.com/hashtag/JiU) [#지유](https://twitter.com/hashtag/지유) [#Dreamcatcher](https://twitter.com/hashtag/Dreamcatcher) [#드림캐쳐](https://twitter.com/hashtag/드림캐쳐) [pic.twitter.com/IRpqINac5X](https://twitter.com/PaniclnTheCity/status/1564281755531722755/photo/1)"

  const rendered = replaceEntities(cjkEntities, tweetText)
  expect(rendered).toBe(expectedMarkdown)
})
// Regression test: '#드림' is a strict prefix of '#드림캐쳐'. Replacing the
// shorter tag must not match inside the longer one, so each hashtag has to be
// linked exactly once and in full.
it('Does not incorrectly split hashtags with CJK characters', () => {
  expect(
    replaceEntities(
      {
        hashtags: [
          // End-exclusive offsets of each hashtag (including '#') in the text below.
          {start: 15, end: 18, tag: '드림'},
          {start: 33, end: 38, tag: '드림캐쳐'},
        ],
      },
      'A tweet with a #드림 hashtag and a #드림캐쳐 hashtag.'
    )
    // Without a matcher, `expect(...)` asserts nothing and the test always
    // passes; pin the exact markdown that should be produced.
  ).toBe(
    'A tweet with a [#드림](https://twitter.com/hashtag/드림) hashtag and a [#드림캐쳐](https://twitter.com/hashtag/드림캐쳐) hashtag.'
  )
})
})
6 changes: 3 additions & 3 deletions src/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -363,19 +363,19 @@ export const replaceEntities = (entities: Entities, text: string): string => {
})
mentions.forEach(username => {
text = text.replace(
new RegExp(`@${username}\\b`, 'g'),
new RegExp(`@${username}(?= |$)`, 'gm'),
`[@${username}](https://twitter.com/${username})`
)
})
tags.forEach(tag => {
text = text.replace(
new RegExp(`#${tag}\\b`, 'g'),
new RegExp(`#${tag}(?= |$)`, 'gm'),
`[#${tag}](https://twitter.com/hashtag/${tag})`
)
})
cashtags.forEach(tag => {
text = text.replace(
new RegExp(`\\$${tag}\\b`, 'g'),
new RegExp(`\\$${tag}(?= |$)`, 'gm'),
`[$${tag}](https://twitter.com/search?q=%24${tag})`
)
})
Expand Down

0 comments on commit b0a3fd3

Please sign in to comment.