Skip to content

Commit

Permalink
feat: Optimized word count for non-Chinese articles (halo-dev#1865)
Browse files Browse the repository at this point in the history
* feat: word count optimization

Now it can accurately identify the number of words in English articles and mixed language articles with character fields.

* checkstyle

* delete field charCount

* fix typo and add some complex unit test

* refine unit test

* uniform word count

* fix style
  • Loading branch information
Yhcrown authored May 6, 2022
1 parent 15d2f8e commit 508c41b
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 5 deletions.
6 changes: 4 additions & 2 deletions src/main/java/run/halo/app/model/entity/BasePost.java
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public class BasePost extends BaseEntity {
private Integer topPriority;

/**
* Likes
* Likes.
*/
@Column(name = "likes")
@ColumnDefault("0")
Expand All @@ -169,7 +169,7 @@ public class BasePost extends BaseEntity {
private String metaDescription;

/**
* Content word count
* Content word count.
*/
@Column(name = "word_count")
@ColumnDefault("0")
Expand All @@ -188,6 +188,7 @@ public class BasePost extends BaseEntity {
@Transient
private PatchedContent content;


@Override
public void prePersist() {
super.prePersist();
Expand Down Expand Up @@ -243,6 +244,7 @@ public void prePersist() {
if (version == null || version < 0) {
version = 1;
}

// Clear the value of the deprecated attributes
this.originalContent = "";
this.formatContent = "";
Expand Down
40 changes: 38 additions & 2 deletions src/main/java/run/halo/app/service/impl/BasePostServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ public abstract class BasePostServiceImpl<POST extends BasePost>

private static final Pattern BLANK_PATTERN = Pattern.compile("\\s");

private static final String CHINESE_REGEX = "[^\\x00-\\xff]";

private static final String PUNCTUATION_REGEX = "[\\p{P}\\p{S}\\p{Z}\\s]+";

public BasePostServiceImpl(BasePostRepository<POST> basePostRepository,
OptionService optionService,
ContentService contentService,
Expand Down Expand Up @@ -301,7 +305,6 @@ public POST createOrUpdateBy(POST post) {
PatchedContent postContent = post.getContent();
// word count stat
post.setWordCount(htmlFormatWordCount(postContent.getContent()));

POST savedPost;
// Create or update post
if (ServiceUtils.isEmptyId(post.getId())) {
Expand Down Expand Up @@ -484,7 +487,7 @@ protected <T extends BasePostSimpleDTO> void generateAndSetSummaryIfAbsent(POST
}
}

// CS304 issue link : https://github.com/halo-dev/halo/issues/1224
// CS304 issue link : https://github.com/halo-dev/halo/issues/1759

/**
* @param htmlContent the markdown style content
Expand All @@ -498,6 +501,39 @@ public static long htmlFormatWordCount(String htmlContent) {

String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);

String tempString = cleanContent.replaceAll(CHINESE_REGEX, "");

String otherString = cleanContent.replaceAll(CHINESE_REGEX, " ");

int chineseWordCount = cleanContent.length() - tempString.length();

String[] otherWords = otherString.split(PUNCTUATION_REGEX);

int otherWordLength = otherWords.length;

if (otherWordLength > 0 && otherWords[0].length() == 0) {
otherWordLength--;
}

if (otherWords.length > 1 && otherWords[otherWords.length - 1].length() == 0) {
otherWordLength--;
}

return chineseWordCount + otherWordLength;
}

/**
* @param htmlContent the markdown style content
* @return character count except space and line separator
*/

public static long htmlFormatCharacterCount(String htmlContent) {
if (htmlContent == null) {
return 0;
}

String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);

Matcher matcher = BLANK_PATTERN.matcher(cleanContent);

int count = 0;
Expand Down
54 changes: 53 additions & 1 deletion src/test/java/run/halo/app/service/impl/HTMLWordCountTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ public class HTMLWordCountTest {

String emptyString = "";

String englishString = "I have a red apple";

String hybridString = "I have a red apple哈哈";


String complexText2 = "Hi,Jessica!这个project的schedule有些问题。";

String complexText3 = "The company had a meeting yesterday。Why did you ask for leave?";

String complexText4 = "这是一个句子,但是只有中文。";

String complexText5 =
"The wind and the moon are all beautiful, love and hate are all romantic.";

@Test
void pictureTest() {
assertEquals("图片字数测试".length(),
Expand Down Expand Up @@ -128,4 +142,42 @@ void emptyTest() {
assertEquals(0,
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(emptyString)));
}
}

/**
 * The pure-English sample is rendered to HTML and is expected to count as 5 words.
 */
@Test
void englishTest() {
    final long actualWords =
        BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(englishString));

    assertEquals(5, actualWords);
}

/**
 * The mixed English/Chinese sample is rendered to HTML and is expected to count as 7 words.
 */
@Test
void hybridTest() {
    final long actualWords =
        BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(hybridString));

    assertEquals(7, actualWords);
}

/**
 * The pure-English sample is rendered to HTML and is expected to yield a character
 * count of 14.
 */
@Test
void englishCharacterTest() {
    final long actualChars =
        BasePostServiceImpl.htmlFormatCharacterCount(MarkdownUtils.renderHtml(englishString));

    assertEquals(14, actualChars);
}

/**
 * The mixed English/Chinese sample is rendered to HTML and is expected to yield a
 * character count of 16.
 */
@Test
void hybridCharacterTest() {
    final long actualChars =
        BasePostServiceImpl.htmlFormatCharacterCount(MarkdownUtils.renderHtml(hybridString));

    assertEquals(16, actualChars);
}

/**
 * Each of the four more complex samples is rendered to HTML and is expected to
 * report a word count of 14. Samples are checked in declaration order and the
 * first mismatch fails the test.
 */
@Test
void moreComplexTest() {
    final String[] samples = {
        complexText2,
        complexText3,
        complexText4,
        complexText5
    };

    for (String sample : samples) {
        assertEquals(14,
            BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(sample)));
    }
}


}

0 comments on commit 508c41b

Please sign in to comment.