Skip to content

Commit

Permalink
correct word break and space addition in case of character composition
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Apr 20, 2021
1 parent 3206950 commit 74a99fb
Showing 1 changed file with 43 additions and 12 deletions.
55 changes: 43 additions & 12 deletions src/XmlAltoOutputDev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -733,10 +733,10 @@ TextWord::TextWord(GList *charsA, int start, int lenA,
}
}

if (!isUpdateAccentedChar)
if (!isUpdateAccentedChar) {
//text[i] = ch->c;
chars->append(ch);

}

charPos[i] = ch->charPos;
if (i == len - 1) {
Expand Down Expand Up @@ -1074,7 +1074,6 @@ void TextRawWord::addChar(GfxState *state, double x, double y, double dx,

if (overlap && len > 0) {
ModifierClass leftClass = NOT_A_MODIFIER, rightClass = NOT_A_MODIFIER;
//Unicode prvChar = text[len - 1];
Unicode prvChar = ((TextChar *) chars->get(len - 1))->c;
leftClass = classifyChar(prvChar);
rightClass = classifyChar(u);
Expand All @@ -1090,6 +1089,10 @@ void TextRawWord::addChar(GfxState *state, double x, double y, double dx,
diactritic = getCombiningDiacritic(leftClass);
baseChar = new UnicodeString(wchar_t(getStandardBaseChar(u)));
}
// Note: in this case we have to be careful with the word coordinates. The first
// character of the word might be a modifier, so we should use the base char instead;
// otherwise we could introduce a spurious word break after the combined character due
// to the shift from the baseline.
} else if (rightClass != NOT_A_MODIFIER) {
diactritic = getCombiningDiacritic(rightClass);
baseChar = new UnicodeString(wchar_t(getStandardBaseChar(prvChar)));
Expand All @@ -1104,9 +1107,8 @@ void TextRawWord::addChar(GfxState *state, double x, double y, double dx,
resultChar = nfkc->normalizeSecondAndAppend(resultChar, *diacriticChar, errorCode);
} else
resultChar = nfkc->normalizeSecondAndAppend(*baseChar, *diacriticChar, errorCode);
//text[len - 1] = resultChar.charAt(0);
((TextChar *) chars->get(len - 1))->c = resultChar.charAt(0);
//here we should compare both coords and keep surrounding ones

switch (rot) {
case 0:
if (len == 0) {
Expand Down Expand Up @@ -2714,8 +2716,8 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx,
// (2) this character overlaps the previous one (duplicated text), or
// (3) the previous character was an overlap (we want each duplicated
// character to be in a word by itself at this stage)
// characters to be in a word by itself) // HD deleted
if (curWord && curWord->len > 0) {

base = sp = delta = 0; // make gcc happy
switch (curWord->rot) {
case 0:
Expand All @@ -2739,6 +2741,13 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx,
delta = curWord->edge[curWord->len - 1] - y1;
break;
}

if (lastCharOverlap) {
// the previous overlap (always from a character composition here)
// made the base not reliable, so we align to the current one
curWord->base = base;
}

sp -= curWord->charSpace;
curWord->charSpace = state->getCharSpace();
overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base
Expand All @@ -2747,26 +2756,48 @@ void TextPage::addCharToRawWord(GfxState *state, double x, double y, double dx,

// avoid splitting token when overlapping is surrounded by diacritic
ModifierClass modifierClass = NOT_A_MODIFIER;
ModifierClass rightClass = NOT_A_MODIFIER;
ModifierClass leftClass = NOT_A_MODIFIER;
if (curWord->len > 0)
modifierClass = classifyChar(((TextChar *) curWord->chars->get(curWord->getLength() - 1))->c);
if (modifierClass == NOT_A_MODIFIER)
modifierClass = classifyChar(u[0]);
leftClass = classifyChar(((TextChar *) curWord->chars->get(curWord->getLength() - 1))->c);
//if (modifierClass == NOT_A_MODIFIER)
rightClass = classifyChar(u[0]);
GBool space = sp > minWordBreakSpace * curWord->fontSize;

if ((rightClass != NOT_A_MODIFIER || leftClass != NOT_A_MODIFIER) ) {
// the word break happens at the modifier char, but depending on the
// left or right combination, it will consume the left or the right character,
// and a possible word break will then occur after or before the modifier respectively
if (leftClass != NOT_A_MODIFIER) {
// no break before this char
modifierClass = leftClass;
}
else if (rightClass != NOT_A_MODIFIER) {
// break is allowed before this char
modifierClass = NOT_A_MODIFIER;
}
}

if(space){
curWord->setSpaceAfter(gTrue);
if (curWord->chars->getLength() > 0)
((TextChar *) curWord->chars->get(curWord->chars->getLength() - 1))->spaceAfter =
(char) gTrue;
}
// take into account rotation angle ??
if (((overlap || fabs(base - curWord->base) > 1 ||
space ||
(sp < -minDupBreakOverlap * curWord->fontSize)) && modifierClass == NOT_A_MODIFIER)) {
if ( (overlap ||
fabs(base - curWord->base) > 1 ||
space ||
(sp < -minDupBreakOverlap * curWord->fontSize))
&& modifierClass == NOT_A_MODIFIER) {
endWord();
beginWord(state, x, y);
}
lastCharOverlap = overlap;
if (leftClass != NOT_A_MODIFIER) {
// we keep track of the composition event for the next character, as the base won't be reliable
lastCharOverlap = gTrue;
}
} else {
lastCharOverlap = gFalse;
}
Expand Down

0 comments on commit 74a99fb

Please sign in to comment.