Skip to content

Commit

Permalink
CJK: improved typography by tweaking punctuations
Browse files Browse the repository at this point in the history
Add support in textlang for flagging CJK punctuations, and
ensuring width adjustment (forced or allowed reduction)
depending on a punctuation context (neighbour, start or
end of line), and differently depending if the language
is Japanese, Simplified Chinese or Traditional Chinese.
This allows, for example, to make consecutive opening or
closing punctuations halfwidth in SC, and some punctuations
at end of line halfwidth, respecting the recommended
typography rules for each language (clreq, jlreq).

In lvtextfm, detect and flag punctuation chars as "flexible"
CJK chars, and use the typography rules when breaking lines
and making words.
When line-breaking, when a CJK char would not fit, and
a break would not be allowed (which would cause a hole
the size of a glyph at end of line, that text justification
would solve by spreading out the glyphs), try to steal
some width from any previous "flexible" punctuation that
stayed full width.
Disable all this when kerning mode is "off", to allow
getting the old behaviour.

In lvfntman, when a CJK glyph got its width reduced,
try to shift the drawing so this CJK glyph appears in
this reduced width as it would naturally (left, right
or centered).
  • Loading branch information
poire-z committed Jun 5, 2022
1 parent a7cea02 commit 3c94f6e
Show file tree
Hide file tree
Showing 7 changed files with 841 additions and 64 deletions.
16 changes: 3 additions & 13 deletions crengine/include/lvfnt.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,22 +247,12 @@ lUInt16 lvfontMeasureText( const lvfont_handle pfont,
#define LCHAR_IS_COLLAPSED_SPACE 0x0200 ///< flag: this char is a space that should not be rendered
#define LCHAR_IS_TO_IGNORE 0x0400 ///< flag: this char is to be ignored/skipped in text measurement and drawing
#define LCHAR_IS_RTL 0x0800 ///< flag: this char is part of a RTL segment

#define LCHAR__AVAILABLE_BIT_13__ 0x1000
#define LCHAR__AVAILABLE_BIT_14__ 0x2000
#define LCHAR_IS_CJK 0x1000 ///< flag: this char is CJK
#define LCHAR_IS_FLEXIBLE_WIDTH_CJK 0x2000 ///< flag: this char is a CJK fullwidth char that can have its
/// nominal width modified (mostly small punctuation)
#define LCHAR__AVAILABLE_BIT_15__ 0x4000
#define LCHAR__AVAILABLE_BIT_16__ 0x8000

// Some idea, if needed:
// #define LCHAR_IS_CJK_NOT_PUNCT 0x1000 ///< flag: this char is part a CJK char but not a punctuation
// #define LCHAR_IS_CJK_LEFT_PUNCT 0x2000 ///< flag: this char is part a CJK left punctuation
// #define LCHAR_IS_CJK_RIGHT_PUNCT 0x4000 ///< flag: this char is part a CJK right punctuation
// #define LCHAR_IS_CJK_PUNCT 0x6000 ///< flag: (for checking) this char is a CJK punctuation (neutral if set)
// #define LCHAR_IS_CJK 0x7000 ///< flag: (for checking) this char is a CJK char

// LCHAR_IS_EOL was not used by any code, and has been replaced by LCHAR_IS_CLUSTER_TAIL
// #define LCHAR_IS_EOL 0x0010 ///< flag: this char is CR or LF


/** \brief returns true if character is unicode space
\param code is character
Expand Down
1 change: 1 addition & 0 deletions crengine/include/lvfntman.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ enum kerning_mode_t {
#define LFNT_HINT_IS_FALLBACK_FONT 0x0010 /// set on recursive Harfbuzz rendering/drawing with a fallback font

#define LFNT_HINT_TRANSFORM_STRETCH 0x0100 /// Glyph(s) are to be stretched so their bounding box fits the provided w/h
#define LFNT_HINT_CJK_ALTERED_WIDTH 0x0200 /// CJK full width glyph is to be shifted to look correct in a non-nominal width

// These 4 translate from LTEXT_TD_* equivalents (see lvtextfm.h). Keep them in sync.
#define LFNT_DRAW_UNDERLINE 0x1000 /// underlined text
Expand Down
5 changes: 3 additions & 2 deletions crengine/include/lvtextfm.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,9 @@ typedef struct
// formatted_word_t flags
#define LTEXT_WORD_CAN_ADD_SPACE_AFTER 0x0001 /// can add space after this word
#define LTEXT_WORD_CAN_HYPH_BREAK_LINE_AFTER 0x0002 /// can break with hyphenation after this word
#define LTEXT_WORD__AVAILABLE_BIT_03__ 0x0004
#define LTEXT_WORD__AVAILABLE_BIT_04__ 0x0008
#define LTEXT_WORD_IS_CJK 0x0004 /// word is a single CJK char
#define LTEXT_WORD_IS_FLEXIBLE_WIDTH_CJK 0x0008 /// word is also a CJK char that may get its nominal width modified (CJK punctuation)
/// (we could share 0x0002 if we need to regain a bit)

#define LTEXT_WORD_IS_LINK_START 0x0010 /// first word of link flag
#define LTEXT_WORD_IS_IMAGE 0x0020 /// word is an image
Expand Down
150 changes: 150 additions & 0 deletions crengine/include/textlang.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,144 @@
#define TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC false
#define TEXTLANG_FALLBACK_HYPH_DICT_ID U"English_US.pattern" // For languages without specific hyph dicts


// The following CJK categorisation is only used by lvtextfm.cpp - but put here to get
// the specific SC/TC/JA typography rules in textlang.cpp and get lvtextfm generic.

// Fullwidth CJK chars categories, based on jlreq https://www.w3.org/TR/jlreq/#character_classes
// (jlreq does not mention fullwidth ascii unicode codepoints, so we'll consider
// what it mentions about ascii chars for their fullwidth Unicode equivalents)
// Category of a (mostly fullwidth) CJK character, as returned by getCJKCharType().
// The enumerators are used as row/column indices into cjk_width_adjustment_table_t,
// so their order and CJKT_MAX must stay in sync with the tables defined in textlang.cpp.
enum cjk_type_t {
cjkt_other = 0, // Anything not specifically handled (keeps its initial width, can get space after/before)
// NOTE(review): the two following values are never returned by getCJKCharType();
// presumably they describe the line-edge context of a char - confirm against lvtextfm.cpp.
cjkt_start_of_line,
cjkt_end_of_line,
cjkt_ambiguous_quote, // fullwidth quotation mark or apostrophe
cjkt_opening_bracket, // jlreq cl-01 (opening parenthesis, left quotation mark...)
cjkt_closing_bracket, // jlreq cl-02 (closing parenthesis, right quotation mark...)
cjkt_dividing_punct, // jlreq cl-04 (single and double exclamation and question mark)
cjkt_middle_dot, // jlreq cl-05 (colon, semicolon, middle-dot)
cjkt_full_stop, // jlreq cl-06 (ideographic full stop, ascii fullstop)
cjkt_comma, // jlreq cl-07 (ideographic comma, ascii comma)
cjkt_fullwidth_space, // jlreq cl-14 (fullwidth ideographic space)
CJKT_MAX // Number of categories above: not a category itself, used to size the adjustment tables
// Other jlreq classes have usually larger glyphs that aren't squeezable, so we don't handle them specifically.
};

// Width adjustment tables are defined in textlang.cpp
// A table is a CJKT_MAX x CJKT_MAX matrix of lInt8 values, both dimensions being
// indexed by cjk_type_t: TextLangCfg::getCJKWidthAdjustment() reads it as
// table[current][other].
// NOTE(review): the meaning/unit of the stored values is not visible from this
// header - see the table definitions in textlang.cpp.
typedef lInt8 cjk_width_adjustment_table_t[CJKT_MAX][CJKT_MAX];

/// Map a code point to its generic CJK punctuation category.
///
/// Only a fixed list of punctuation code points is recognized (those whose glyph
/// blackbox is much narrower than their advance, so typography may reduce their
/// width); every other code point yields cjkt_other. cjkt_start_of_line and
/// cjkt_end_of_line are never returned by this function.
///
/// The categorization itself is language independent. How a category behaves
/// next to another one is language specific, and is looked up in the
/// cjk_width_adjustment_table_t tables via
/// lang_cfg->getCJKWidthAdjustment(current_cjk_type, next_cjk_type).
///
/// \param ch Unicode code point to classify
/// \return the cjk_type_t category of ch, cjkt_other when not specifically handled
inline cjk_type_t getCJKCharType( lChar32 ch ) {
    switch ( ch ) {
        // ---- Spacing ----
        case 0x3000: // IDEOGRAPHIC SPACE (Zs)
            return cjkt_fullwidth_space;

        // ---- Commas ----
        case 0x3001: // IDEOGRAPHIC COMMA (Po)
        case 0xFF0C: // FULLWIDTH COMMA (Po)
            return cjkt_comma;

        // ---- Full stops ----
        case 0x3002: // IDEOGRAPHIC FULL STOP (Po)
        case 0xFF0E: // FULLWIDTH FULL STOP (Po)
            return cjkt_full_stop;

        // ---- Middle dots, colons, semicolons ----
        case 0x30FB: // KATAKANA MIDDLE DOT (Po)
        case 0xFF1A: // FULLWIDTH COLON (Po)
        case 0xFF1B: // FULLWIDTH SEMICOLON (Po)
            return cjkt_middle_dot;

        // ---- Dividing punctuation ----
        case 0xFF01: // FULLWIDTH EXCLAMATION MARK (Po)
        case 0xFF1F: // FULLWIDTH QUESTION MARK (Po)
            return cjkt_dividing_punct;

        // ---- Quotes with no opening/closing direction ----
        case 0xFF02: // FULLWIDTH QUOTATION MARK (Po)
        case 0xFF07: // FULLWIDTH APOSTROPHE (Po)
            return cjkt_ambiguous_quote;

        // ---- Closing brackets and quotation marks ----
        case 0x3009: // RIGHT ANGLE BRACKET (Pe)
        case 0x300B: // RIGHT DOUBLE ANGLE BRACKET (Pe)
        case 0x300D: // RIGHT CORNER BRACKET (Pe)
        case 0x300F: // RIGHT WHITE CORNER BRACKET (Pe)
        case 0x3011: // RIGHT BLACK LENTICULAR BRACKET (Pe)
        case 0x3015: // RIGHT TORTOISE SHELL BRACKET (Pe)
        case 0x3017: // RIGHT WHITE LENTICULAR BRACKET (Pe)
        case 0x3019: // RIGHT WHITE TORTOISE SHELL BRACKET (Pe)
        case 0x301B: // RIGHT WHITE SQUARE BRACKET (Pe)
        case 0x301E: // DOUBLE PRIME QUOTATION MARK (Pe)
        case 0x301F: // LOW DOUBLE PRIME QUOTATION MARK (Pe)
        case 0xFF09: // FULLWIDTH RIGHT PARENTHESIS (Pe)
        case 0xFF3D: // FULLWIDTH RIGHT SQUARE BRACKET (Pe)
        case 0xFF5D: // FULLWIDTH RIGHT CURLY BRACKET (Pe)
        case 0xFF60: // FULLWIDTH RIGHT WHITE PARENTHESIS (Pe)
        // The two below (and their LEFT counterparts further down) are not CJK
        // chars, but CJK fonts may give them a fullwidth glyph, in which case we
        // want them handled like the CJK brackets. Per the original author's
        // notes: this function gets called for them when measureText() detects
        // that the glyph might be fullwidth with other CJK glyphs around, and
        // these 4 quotation marks were the only non-CJK punctuation found to
        // get a fullwidth glyph among the CJK fonts checked.
        case 0x2019: // RIGHT SINGLE QUOTATION MARK (Pf)
        case 0x201D: // RIGHT DOUBLE QUOTATION MARK (Pf)
            return cjkt_closing_bracket;

        // ---- Opening brackets and quotation marks ----
        case 0x3008: // LEFT ANGLE BRACKET (Ps)
        case 0x300A: // LEFT DOUBLE ANGLE BRACKET (Ps)
        case 0x300C: // LEFT CORNER BRACKET (Ps)
        case 0x300E: // LEFT WHITE CORNER BRACKET (Ps)
        case 0x3010: // LEFT BLACK LENTICULAR BRACKET (Ps)
        case 0x3014: // LEFT TORTOISE SHELL BRACKET (Ps)
        case 0x3016: // LEFT WHITE LENTICULAR BRACKET (Ps)
        case 0x3018: // LEFT WHITE TORTOISE SHELL BRACKET (Ps)
        case 0x301A: // LEFT WHITE SQUARE BRACKET (Ps)
        case 0x301D: // REVERSED DOUBLE PRIME QUOTATION MARK (Ps)
        case 0xFF08: // FULLWIDTH LEFT PARENTHESIS (Ps)
        case 0xFF3B: // FULLWIDTH LEFT SQUARE BRACKET (Ps)
        case 0xFF5B: // FULLWIDTH LEFT CURLY BRACKET (Ps)
        case 0xFF5F: // FULLWIDTH LEFT WHITE PARENTHESIS (Ps)
        case 0x2018: // LEFT SINGLE QUOTATION MARK (Pi) - non-CJK, see note above
        case 0x201C: // LEFT DOUBLE QUOTATION MARK (Pi) - non-CJK, see note above
            return cjkt_opening_bracket;

        // ---- Everything else keeps its nominal width ----
        default:
            return cjkt_other;
    }
}


class TextLangCfg;

class TextLangMan
Expand Down Expand Up @@ -119,7 +257,12 @@ class TextLangCfg
#endif

bool _duplicate_real_hyphen_on_next_line;

bool _is_ja_zh;
bool _is_ja;
bool _is_zh_TC;
bool _is_zh_SC;
const cjk_width_adjustment_table_t * _cjk_width_adjustment_table;

void resetCounters();

Expand Down Expand Up @@ -162,6 +305,13 @@ class TextLangCfg

bool duplicateRealHyphenOnNextLine() const { return _duplicate_real_hyphen_on_next_line; }

/// Look up the width adjustment for a char of category `current` in the context
/// of category `other`, from this language's adjustment table.
/// Reads entry [current][other] of the table pointed to by
/// _cjk_width_adjustment_table and widens it to int.
/// NOTE(review): assumes _cjk_width_adjustment_table is always set to a valid
/// table (it is dereferenced unconditionally) - confirm in textlang.cpp.
int getCJKWidthAdjustment( cjk_type_t current, cjk_type_t other ) const {
    const cjk_width_adjustment_table_t & adjustments = *_cjk_width_adjustment_table;
    return static_cast<int>( adjustments[current][other] );
}
// Language kind accessors: each returns the matching private flag (_is_ja,
// _is_zh_SC, _is_zh_TC) unchanged.
// NOTE(review): where these flags get set is not visible here - presumably in
// the TextLangCfg constructor from the lang tag; confirm in textlang.cpp.
bool isJapanese() const { return _is_ja; }
bool isSimplifiedChinese() const { return _is_zh_SC; }
bool isTraditionalChinese() const { return _is_zh_TC; }

TextLangCfg( lString32 lang_tag );
~TextLangCfg();
};
Expand Down
89 changes: 88 additions & 1 deletion crengine/src/lvfntman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3906,6 +3906,43 @@ class LVFreeTypeFace : public LVFont
printf("%x(x=%d+%d,w=%d) ", glyph_info[i].codepoint, x,
item->origin_x + FONT_METRIC_TO_PX(glyph_pos[i].x_offset), w);
#endif
if ( flags & LFNT_HINT_CJK_ALTERED_WIDTH ) {
// We got x and have w of a fullwidth CJK char, normally some punctuation
// char whose blackbox is narrow and smaller or equal to half its width.
// But the position of this blackbox may depend on the opening/closing
// punctuation status, and on the language requested (Simplified Chinese
// get punctuations left- or right-anchored in the glyph, while Traditional
// Chinese may get them centered in the glyph). We only know about the glyph
// returned by the font here, so we should try to guess how to shift the
// drawing to get this glyph to look alright in half of w at the original x.
if ( item->origin_x + item->bmp_width <= w/2 ) {
// Glyph fully in the left half part (ie. Simplified Chinese closing punctuation)
// Nothing to tweak.
}
else if ( item->origin_x >= w/2 ) {
// Glyph fully in the right half part (ie. Simplified Chinese opening punctuation)
x += width - w;
}
else if ( item->origin_x <= w*1/5 && w - item->origin_x - item->bmp_width >= w*2/5) {
// With some fonts (ie. SimSun), some left/right glyphs may leak slightly over
// the middle: do a few more checks to catch these and handle them as above.
// Glyph mostly in the left half part: nothing to tweak
}
else if ( item->origin_x >= w*2/5 && w - item->origin_x - item->bmp_width <= w*1/5) {
// Glyph mostly in the right half part
x += width - w;
}
else {
// Glyph overlapping the middle of the glyph (ie. Traditional Chinese opening
// or closing punctuation), so probably centered in its glyph.
// We want to keep it centered in the provided width.
x += (width - w) / 2;
}
// We draw such CJK glyph one by one, so make sure the 'x += w' just below
// gives x=x0+width, which is necessary to correctly draw any underline
w = x0 + width - x;
// Note: no thought given about what we should do if non-zero letter_spacing
}
buf->Draw(x + item->origin_x + FONT_METRIC_TO_PX(glyph_pos[i].x_offset),
y + _baseline - item->origin_y - FONT_METRIC_TO_PX(glyph_pos[i].y_offset),
item->bmp,
Expand Down Expand Up @@ -4007,6 +4044,32 @@ class LVFreeTypeFace : public LVFont
}
else {
// Regular drawing of glyph at the baseline
int cjk_dx = 0;
if ( flags & LFNT_HINT_CJK_ALTERED_WIDTH ) {
// See KERNING_MODE_HARFBUZZ section above for details and comments.
int w = posInfo.width;
if ( item->origin_x + item->bmp_width <= w/2 ) {
// Glyph fully in the left half part
}
else if ( item->origin_x >= w/2 ) {
// Glyph fully in the right half part
x += width - w;
}
else if ( item->origin_x <= w*1/5 && w - item->origin_x - item->bmp_width >= w*2/5) {
// Glyph mostly in the left half part
}
else if ( item->origin_x >= w*2/5 && w - item->origin_x - item->bmp_width <= w*1/5) {
// Glyph mostly in the right half part
x += width - w;
}
else {
// Glyph overlapping the middle of the glyph
x += (width - w) / 2;
}
// We draw such CJK glyph one by one, so make sure the 'x += posInfo.width' just below
// gives x=x0+width, which is necessary to correctly draw any underline
cjk_dx = x0 + width - x - posInfo.width;
}
buf->Draw(x + item->origin_x + posInfo.offset,
y + _baseline - item->origin_y,
item->bmp,
Expand All @@ -4016,7 +4079,7 @@ class LVFreeTypeFace : public LVFont
// Assume zero advance means it's a diacritic, and we should not apply
// any letter spacing on this char (now, and when justifying)
if ( posInfo.width != 0 )
x += posInfo.width + letter_spacing;
x += posInfo.width + letter_spacing + cjk_dx;
}
}
}
Expand Down Expand Up @@ -4089,6 +4152,30 @@ class LVFreeTypeFace : public LVFont
}
else {
// Regular drawing of glyph at the baseline
if ( flags & LFNT_HINT_CJK_ALTERED_WIDTH ) {
// See KERNING_MODE_HARFBUZZ section above for details and comments.
if ( item->origin_x + item->bmp_width <= w/2 ) {
// Glyph fully in the left half part
}
else if ( item->origin_x >= w/2 ) {
// Glyph fully in the right half part
x += width - w;
}
else if ( item->origin_x <= w*1/5 && w - item->origin_x - item->bmp_width >= w*2/5) {
// Glyph mostly in the left half part
}
else if ( item->origin_x >= w*2/5 && w - item->origin_x - item->bmp_width <= w*1/5) {
// Glyph mostly in the right half part
x += width - w;
}
else {
// Glyph overlapping the middle of the glyph
x += (width - w) / 2;
}
// We draw such CJK glyph one by one, so make sure the 'x += w' just below
// gives x=x0+width, which is necessary to correctly draw any underline
w = x0 + width - x;
}
buf->Draw( x + FONT_METRIC_TO_PX(kerning) + item->origin_x,
y + _baseline - item->origin_y,
item->bmp,
Expand Down
Loading

0 comments on commit 3c94f6e

Please sign in to comment.