X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.cc;h=a492e7f3e178c2b1c4d90bb2953af97059acacb8;hb=6d7f9e7cf7678b48828be0722ae2e998ce85b7a7;hp=3fcc9ec294ecdbd1e7800c98764c0928e04ca6c8;hpb=7c5ab64d4db75e4bb6dadedb578e10178554d0db;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 3fcc9ec2..a492e7f3 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -24,10 +24,11 @@ #include "gmem.h" #include "GString.h" #include "GList.h" -#include "config.h" +#include "xpdfconfig.h" #include "Error.h" #include "GlobalParams.h" #include "UnicodeMap.h" +#include "UnicodeTypeTable.h" #include "GfxState.h" #include "TextOutputDev.h" @@ -40,176 +41,195 @@ // parameters //------------------------------------------------------------------------ -// Minium and maximum inter-word spacing (as a fraction of the average -// character width). -#define wordMinSpaceWidth 0.3 -#define wordMaxSpaceWidth 2.0 - -// Default min and max inter-word spacing (when the average character -// width is unknown). -#define wordDefMinSpaceWidth 0.2 -#define wordDefMaxSpaceWidth 1.5 - -// Max difference in x,y coordinates (as a fraction of the font size) -// allowed for duplicated text (fake boldface, drop shadows) which is -// to be discarded. -#define dupMaxDeltaX 0.1 -#define dupMaxDeltaY 0.2 - -// Min overlap (as a fraction of the font size) required for two -// lines to be considered vertically overlapping. -#define lineOverlapSlack 0.5 - -// Max difference in baseline y coordinates (as a fraction of the font -// size) allowed for words which are to be grouped into a line, not -// including sub/superscripts. -#define lineMaxBaselineDelta 0.1 - -// Max ratio of font sizes allowed for words which are to be grouped -// into a line, not including sub/superscripts. -#define lineMaxFontSizeRatio 1.4 - -// Min spacing (as a fraction of the font size) allowed between words -// which are to be grouped into a line. -#define lineMinDeltaX -0.5 - -// Minimum vertical overlap (as a fraction of the font size) required -// for superscript and subscript words. -#define lineMinSuperscriptOverlap 0.3 -#define lineMinSubscriptOverlap 0.3 - -// Min/max ratio of font sizes allowed for sub/superscripts compared to -// the base text. -#define lineMinSubscriptFontSizeRatio 0.4 -#define lineMaxSubscriptFontSizeRatio 1.01 -#define lineMinSuperscriptFontSizeRatio 0.4 -#define lineMaxSuperscriptFontSizeRatio 1.01 - -// Max horizontal spacing (as a fraction of the font size) allowed -// before sub/superscripts. -#define lineMaxSubscriptDeltaX 0.2 -#define lineMaxSuperscriptDeltaX 0.2 - -// Maximum vertical spacing (as a fraction of the font size) allowed -// for lines which are to be grouped into a block. -#define blkMaxSpacing 2.0 - -// Max ratio of primary font sizes allowed for lines which are to be -// grouped into a block. -#define blkMaxFontSizeRatio 1.3 - -// Min overlap (as a fraction of the font size) required for two -// blocks to be considered vertically overlapping. -#define blkOverlapSlack 0.5 - -// Max vertical spacing (as a fraction of the font size) allowed -// between blocks which are 'adjacent' when sorted by reading order. -#define blkMaxSortSpacing 2.0 - -// Max vertical offset (as a fraction of the font size) of the top and -// bottom edges allowed for blocks which are to be grouped into a -// flow. -#define flowMaxDeltaY 1.0 +// Each bucket in a text pool includes baselines within a range of +// this many points. +#define textPoolStep 4 + +// Inter-character space width which will cause addChar to start a new +// word. +#define minWordBreakSpace 0.1 + +// Negative inter-character space width, i.e., overlap, which will +// cause addChar to start a new word. +#define minDupBreakOverlap 0.2 + +// Max distance between baselines of two lines within a block, as a +// fraction of the font size. +#define maxLineSpacingDelta 1.5 + +// Max difference in primary font sizes on two lines in the same +// block. Delta1 is used when examining new lines above and below the +// current block; delta2 is used when examining text that overlaps the +// current block; delta3 is used when examining text to the left and +// right of the current block. +#define maxBlockFontSizeDelta1 0.05 +#define maxBlockFontSizeDelta2 0.6 +#define maxBlockFontSizeDelta3 0.2 + +// Max difference in font sizes inside a word. +#define maxWordFontSizeDelta 0.05 + +// Maximum distance between baselines of two words on the same line, +// e.g., distance between subscript or superscript and the primary +// baseline, as a fraction of the font size. +#define maxIntraLineDelta 0.5 + +// Minimum inter-word spacing, as a fraction of the font size. (Only +// used for raw ordering.) +#define minWordSpacing 0.15 + +// Maximum inter-word spacing, as a fraction of the font size. +#define maxWordSpacing 1.5 + +// Maximum horizontal spacing which will allow a word to be pulled +// into a block. +#define minColSpacing1 0.3 + +// Minimum spacing between columns, as a fraction of the font size. +#define minColSpacing2 1.0 + +// Maximum vertical spacing between blocks within a flow, as a +// multiple of the font size. +#define maxBlockSpacing 2.5 + +// Minimum spacing between characters within a word, as a fraction of +// the font size. +#define minCharSpacing -0.2 + +// Maximum spacing between characters within a word, as a fraction of +// the font size, when there is no obvious extra-wide character +// spacing. +#define maxCharSpacing 0.03 + +// When extra-wide character spacing is detected, the inter-character +// space threshold is set to the minimum inter-character space +// multiplied by this constant. +#define maxWideCharSpacingMul 1.3 + +// Max difference in primary,secondary coordinates (as a fraction of +// the font size) allowed for duplicated text (fake boldface, drop +// shadows) which is to be discarded. +#define dupMaxPriDelta 0.1 +#define dupMaxSecDelta 0.2 //------------------------------------------------------------------------ // TextFontInfo //------------------------------------------------------------------------ TextFontInfo::TextFontInfo(GfxState *state) { - double *textMat; - double t1, t2, avgWidth, w; - int n, i; - gfxFont = state->getFont(); - textMat = state->getTextMat(); - horizScaling = state->getHorizScaling(); - if ((t1 = fabs(textMat[0])) > 0.01 && - (t2 = fabs(textMat[3])) > 0.01) { - horizScaling *= t1 / t2; - } - - minSpaceWidth = horizScaling * wordDefMinSpaceWidth; - maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth; - if (gfxFont && gfxFont->isCIDFont()) { - //~ handle 16-bit fonts - } else if (gfxFont && gfxFont->getType() != fontType3) { - avgWidth = 0; - n = 0; - for (i = 0; i < 256; ++i) { - w = ((Gfx8BitFont *)gfxFont)->getWidth(i); - if (w > 0) { - avgWidth += w; - ++n; - } - } - if (n > 0) { - avgWidth /= n; - minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth; - maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth; - } - } - +#if TEXTOUT_WORD_LIST + fontName = (gfxFont && gfxFont->getOrigName()) + ? gfxFont->getOrigName()->copy() + : (GString *)NULL; +#endif } TextFontInfo::~TextFontInfo() { +#if TEXTOUT_WORD_LIST + if (fontName) { + delete fontName; + } +#endif } GBool TextFontInfo::matches(GfxState *state) { - double *textMat; - double t1, t2, h; - - textMat = state->getTextMat(); - h = state->getHorizScaling(); - if ((t1 = fabs(textMat[0])) > 0.01 && - (t2 = fabs(textMat[3])) > 0.01) { - h *= t1 / t2; - } - return state->getFont() == gfxFont && - fabs(h - horizScaling) < 0.01; + return state->getFont() == gfxFont; } //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ -TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA, - TextFontInfo *fontA, double fontSizeA) { +TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, + int charPosA, TextFontInfo *fontA, double fontSizeA) { GfxFont *gfxFont; - double x, y; + double x, y, ascent, descent; + rot = rotA; charPos = charPosA; charLen = 0; font = fontA; fontSize = fontSizeA; state->transform(x0, y0, &x, &y); if ((gfxFont = font->gfxFont)) { - yMin = y - gfxFont->getAscent() * fontSize; - yMax = y - gfxFont->getDescent() * fontSize; + ascent = gfxFont->getAscent() * fontSize; + descent = gfxFont->getDescent() * fontSize; } else { // this means that the PDF file draws text without a current font, // which should never happen - yMin = y - 0.95 * fontSize; - yMax = y + 0.35 * fontSize; - } - if (yMin == yMax) { - // this is a sanity check for a case that shouldn't happen -- but - // if it does happen, we want to avoid dividing by zero later - yMin = y; - yMax = y + 1; + ascent = 0.95 * fontSize; + descent = -0.35 * fontSize; + } + switch (rot) { + case 0: + yMin = y - ascent; + yMax = y - descent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 1: + xMin = x + descent; + xMax = x + ascent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; + case 2: + yMin = y + descent; + yMax = y + ascent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 3: + xMin = x - ascent; + xMax = x - descent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; } - yBase = y; text = NULL; - xRight = NULL; + edge = NULL; len = size = 0; spaceAfter = gFalse; next = NULL; -} +#if TEXTOUT_WORD_LIST + GfxRGB rgb; + if ((state->getRender() & 3) == 1) { + state->getStrokeRGB(&rgb); + } else { + state->getFillRGB(&rgb); + } + colorR = rgb.r; + colorG = rgb.g; + colorB = rgb.b; +#endif +} TextWord::~TextWord() { gfree(text); - gfree(xRight); + gfree(edge); } void TextWord::addChar(GfxState *state, double x, double y, @@ -217,234 +237,1457 @@ void TextWord::addChar(GfxState *state, double x, double y, if (len == size) { size += 16; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); - xRight = (double *)grealloc(xRight, size * sizeof(double)); + edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); } text[len] = u; - if (len == 0) { - xMin = x; + switch (rot) { + case 0: + if (len == 0) { + xMin = x; + } + edge[len] = x; + xMax = edge[len+1] = x + dx; + break; + case 1: + if (len == 0) { + yMin = y; + } + edge[len] = y; + yMax = edge[len+1] = y + dy; + break; + case 2: + if (len == 0) { + xMax = x; + } + edge[len] = x; + xMin = edge[len+1] = x + dx; + break; + case 3: + if (len == 0) { + yMax = y; + } + edge[len] = y; + yMin = edge[len+1] = y + dy; + break; } - xMax = xRight[len] = x + dx; ++len; } -// Returns true if comes before in xy order. -GBool TextWord::xyBefore(TextWord *word2) { - return xMin < word2->xMin || - (xMin == word2->xMin && yMin < word2->yMin); -} - -// Merge another word onto the end of this one. -void TextWord::merge(TextWord *word2) { +void TextWord::merge(TextWord *word) { int i; - xMax = word2->xMax; - if (word2->yMin < yMin) { - yMin = word2->yMin; + if (word->xMin < xMin) { + xMin = word->xMin; } - if (word2->yMax > yMax) { - yMax = word2->yMax; + if (word->yMin < yMin) { + yMin = word->yMin; } - if (len + word2->len > size) { - size = len + word2->len; + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + if (len + word->len > size) { + size = len + word->len; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); - xRight = (double *)grealloc(xRight, size * sizeof(double)); + edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); + } + for (i = 0; i < word->len; ++i) { + text[len + i] = word->text[i]; + edge[len + i] = word->edge[i]; + } + edge[len + word->len] = word->edge[word->len]; + len += word->len; + charLen += word->charLen; +} + +inline int TextWord::primaryCmp(TextWord *word) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - word->xMin; + break; + case 1: + cmp = yMin - word->yMin; + break; + case 2: + cmp = word->xMax - xMax; + break; + case 3: + cmp = word->yMax - yMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +double TextWord::primaryDelta(TextWord *word) { + double delta; + + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = word->xMin - xMax; + break; + case 1: + delta = word->yMin - yMax; + break; + case 2: + delta = xMin - word->xMax; + break; + case 3: + delta = yMin - word->yMax; + break; + } + return delta; +} + +int TextWord::cmpYX(const void *p1, const void *p2) { + TextWord *word1 = *(TextWord **)p1; + TextWord *word2 = *(TextWord **)p2; + double cmp; + + cmp = word1->yMin - word2->yMin; + if (cmp == 0) { + cmp = word1->xMin - word2->xMin; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +#if TEXTOUT_WORD_LIST + +GString *TextWord::getText() { + GString *s; + UnicodeMap *uMap; + char buf[8]; + int n, i; + + s = new GString(); + if (!(uMap = globalParams->getTextEncoding())) { + return s; } - for (i = 0; i < word2->len; ++i) { - text[len + i] = word2->text[i]; - xRight[len + i] = word2->xRight[i]; + for (i = 0; i < len; ++i) { + n = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, n); } - len += word2->len; - charLen += word2->charLen; + uMap->decRefCnt(); + return s; +} + +#endif // TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextPool +//------------------------------------------------------------------------ + +TextPool::TextPool() { + minBaseIdx = 0; + maxBaseIdx = -1; + pool = NULL; + cursor = NULL; + cursorBaseIdx = -1; +} + +TextPool::~TextPool() { + int baseIdx; + TextWord *word, *word2; + + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + for (word = pool[baseIdx - minBaseIdx]; word; word = word2) { + word2 = word->next; + delete word; + } + } + gfree(pool); +} + +int TextPool::getBaseIdx(double base) { + int baseIdx; + + baseIdx = (int)(base / textPoolStep); + if (baseIdx < minBaseIdx) { + return minBaseIdx; + } + if (baseIdx > maxBaseIdx) { + return maxBaseIdx; + } + return baseIdx; +} + +void TextPool::addWord(TextWord *word) { + TextWord **newPool; + int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx; + TextWord *w0, *w1; + + // expand the array if needed + wordBaseIdx = (int)(word->base / textPoolStep); + if (minBaseIdx > maxBaseIdx) { + minBaseIdx = wordBaseIdx - 128; + maxBaseIdx = wordBaseIdx + 128; + pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + pool[baseIdx - minBaseIdx] = NULL; + } + } else if (wordBaseIdx < minBaseIdx) { + newMinBaseIdx = wordBaseIdx - 128; + newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) { + newPool[baseIdx - newMinBaseIdx] = NULL; + } + memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, + (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); + gfree(pool); + pool = newPool; + minBaseIdx = newMinBaseIdx; + } else if (wordBaseIdx > maxBaseIdx) { + newMaxBaseIdx = wordBaseIdx + 128; + pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) { + pool[baseIdx - minBaseIdx] = NULL; + } + maxBaseIdx = newMaxBaseIdx; + } + + // insert the new word + if (cursor && wordBaseIdx == cursorBaseIdx && + word->primaryCmp(cursor) > 0) { + w0 = cursor; + w1 = cursor->next; + } else { + w0 = NULL; + w1 = pool[wordBaseIdx - minBaseIdx]; + } + for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ; + word->next = w1; + if (w0) { + w0->next = word; + } else { + pool[wordBaseIdx - minBaseIdx] = word; + } + cursor = word; + cursorBaseIdx = wordBaseIdx; } //------------------------------------------------------------------------ // TextLine //------------------------------------------------------------------------ -TextLine::TextLine() { - words = NULL; +TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) { + blk = blkA; + rot = rotA; + xMin = yMin = 0; + xMax = yMax = -1; + base = baseA; + words = lastWord = NULL; text = NULL; - xRight = NULL; + edge = NULL; col = NULL; len = 0; + convertedLen = 0; hyphenated = gFalse; - pageNext = NULL; next = NULL; - flowNext = NULL; } TextLine::~TextLine() { - TextWord *w1, *w2; + TextWord *word; - for (w1 = words; w1; w1 = w2) { - w2 = w1->next; - delete w1; + while (words) { + word = words; + words = words->next; + delete word; } gfree(text); - gfree(xRight); + gfree(edge); gfree(col); } -// Returns true if comes before in yx order, allowing -// slack for vertically overlapping lines. -GBool TextLine::yxBefore(TextLine *line2) { - double dy; +void TextLine::addWord(TextWord *word) { + if (lastWord) { + lastWord->next = word; + } else { + words = word; + } + lastWord = word; + + if (xMin > xMax) { + xMin = word->xMin; + xMax = word->xMax; + yMin = word->yMin; + yMax = word->yMax; + } else { + if (word->xMin < xMin) { + xMin = word->xMin; + } + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMin < yMin) { + yMin = word->yMin; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + } +} - dy = lineOverlapSlack * fontSize; +double TextLine::primaryDelta(TextLine *line) { + double delta; - // non-overlapping case - if (line2->yMin > yMax - dy || - line2->yMax < yMin + dy) { - return yMin < line2->yMin || - (yMin == line2->yMin && xMin < line2->xMin); + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = line->xMin - xMax; + break; + case 1: + delta = line->yMin - yMax; + break; + case 2: + delta = xMin - line->xMax; + break; + case 3: + delta = yMin - line->yMax; + break; } + return delta; +} + +int TextLine::primaryCmp(TextLine *line) { + double cmp; - // overlapping case - return xMin < line2->xMin; + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - line->xMin; + break; + case 1: + cmp = yMin - line->yMin; + break; + case 2: + cmp = line->xMax - xMax; + break; + case 3: + cmp = line->yMax - yMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } -// Merge another line's words onto the end of this line. -void TextLine::merge(TextLine *line2) { - int newLen, i; +int TextLine::secondaryCmp(TextLine *line) { + double cmp; - xMax = line2->xMax; - if (line2->yMin < yMin) { - yMin = line2->yMin; + cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base; + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLine::cmpYX(TextLine *line) { + int cmp; + + if ((cmp = secondaryCmp(line))) { + return cmp; } - if (line2->yMax > yMax) { - yMax = line2->yMax; + return primaryCmp(line); +} + +int TextLine::cmpXY(const void *p1, const void *p2) { + TextLine *line1 = *(TextLine **)p1; + TextLine *line2 = *(TextLine **)p2; + int cmp; + + if ((cmp = line1->primaryCmp(line2))) { + return cmp; } - xSpaceR = line2->xSpaceR; - lastWord->spaceAfter = gTrue; - lastWord->next = line2->words; - lastWord = line2->lastWord; - line2->words = NULL; - newLen = len + 1 + line2->len; - text = (Unicode *)grealloc(text, newLen * sizeof(Unicode)); - xRight = (double *)grealloc(xRight, newLen * sizeof(double)); - text[len] = (Unicode)0x0020; - xRight[len] = line2->xMin; - for (i = 0; i < line2->len; ++i) { - text[len + 1 + i] = line2->text[i]; - xRight[len + 1 + i] = line2->xRight[i]; + return line1->secondaryCmp(line2); +} + +void TextLine::coalesce(UnicodeMap *uMap) { + TextWord *word0, *word1; + double space, delta, minSpace; + GBool isUnicode; + char buf[8]; + int i, j; + + if (words->next) { + + // compute the inter-word space threshold + if (words->len > 1 || words->next->len > 1) { + minSpace = 0; + } else { + minSpace = words->primaryDelta(words->next); + for (word0 = words->next, word1 = word0->next; + word1 && minSpace > 0; + word0 = word1, word1 = word0->next) { + if (word1->len > 1) { + minSpace = 0; + } + delta = word0->primaryDelta(word1); + if (delta < minSpace) { + minSpace = delta; + } + } + } + if (minSpace <= 0) { + space = maxCharSpacing * words->fontSize; + } else { + space = maxWideCharSpacingMul * minSpace; + } + + // merge words + word0 = words; + word1 = words->next; + while (word1) { + if (word0->primaryDelta(word1) >= space) { + word0->spaceAfter = gTrue; + word0 = word1; + word1 = word1->next; + } else if (word0->font == word1->font && + fabs(word0->fontSize - word1->fontSize) < + maxWordFontSizeDelta * words->fontSize && + word1->charPos == word0->charPos + word0->charLen) { + word0->merge(word1); + word0->next = word1->next; + delete word1; + word1 = word0->next; + } else { + word0 = word1; + word1 = word1->next; + } + } } - len = newLen; - convertedLen += line2->convertedLen; - hyphenated = line2->hyphenated; + + // build the line text + isUnicode = uMap ? uMap->isUnicode() : gFalse; + len = 0; + for (word1 = words; word1; word1 = word1->next) { + len += word1->len; + if (word1->spaceAfter) { + ++len; + } + } + text = (Unicode *)gmalloc(len * sizeof(Unicode)); + edge = (double *)gmalloc((len + 1) * sizeof(double)); + i = 0; + for (word1 = words; word1; word1 = word1->next) { + for (j = 0; j < word1->len; ++j) { + text[i] = word1->text[j]; + edge[i] = word1->edge[j]; + ++i; + } + edge[i] = word1->edge[word1->len]; + if (word1->spaceAfter) { + text[i] = (Unicode)0x0020; + ++i; + } + } + + // compute convertedLen and set up the col array + col = (int *)gmalloc((len + 1) * sizeof(int)); + convertedLen = 0; + for (i = 0; i < len; ++i) { + col[i] = convertedLen; + if (isUnicode) { + ++convertedLen; + } else if (uMap) { + convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); + } + } + col[len] = convertedLen; + + // check for hyphen at end of line + //~ need to check for other chars used as hyphens + hyphenated = text[len - 1] == (Unicode)'-'; +} + +//------------------------------------------------------------------------ +// TextLineFrag +//------------------------------------------------------------------------ + +class TextLineFrag { +public: + + TextLine *line; // the line object + int start, len; // offset and length of this fragment + // (in Unicode chars) + double xMin, xMax; // bounding box coordinates + double yMin, yMax; + double base; // baseline virtual coordinate + int col; // first column + + void init(TextLine *lineA, int startA, int lenA); + void computeCoords(GBool oneRot); + + static int cmpYXPrimaryRot(const void *p1, const void *p2); + static int cmpYXLineRot(const void *p1, const void *p2); + static int cmpXYLineRot(const void *p1, const void *p2); +}; + +void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { + line = lineA; + start = startA; + len = lenA; + col = line->col[start]; +} + +void TextLineFrag::computeCoords(GBool oneRot) { + TextBlock *blk; + double d0, d1, d2, d3, d4; + + if (oneRot) { + + switch (line->rot) { + case 0: + xMin = line->edge[start]; + xMax = line->edge[start + len]; + yMin = line->yMin; + yMax = line->yMax; + break; + case 1: + xMin = line->xMin; + xMax = line->xMax; + yMin = line->edge[start]; + yMax = line->edge[start + len]; + break; + case 2: + xMin = line->edge[start + len]; + xMax = line->edge[start]; + yMin = line->yMin; + yMax = line->yMax; + break; + case 3: + xMin = line->xMin; + xMax = line->xMax; + yMin = line->edge[start + len]; + yMax = line->edge[start]; + break; + } + base = line->base; + + } else { + + if (line->rot == 0 && line->blk->page->primaryRot == 0) { + + xMin = line->edge[start]; + xMax = line->edge[start + len]; + yMin = line->yMin; + yMax = line->yMax; + base = line->base; + + } else { + + blk = line->blk; + d0 = line->edge[start]; + d1 = line->edge[start + len]; + d2 = d3 = d4 = 0; // make gcc happy + + switch (line->rot) { + case 0: + d2 = line->yMin; + d3 = line->yMax; + d4 = line->base; + d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin); + d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin); + d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin); + d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin); + d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin); + break; + case 1: + d2 = line->xMax; + d3 = line->xMin; + d4 = line->base; + d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin); + d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin); + d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin); + d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin); + d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin); + break; + case 2: + d2 = line->yMax; + d3 = line->yMin; + d4 = line->base; + d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin); + d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin); + d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin); + d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin); + d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin); + break; + case 3: + d2 = line->xMin; + d3 = line->xMax; + d4 = line->base; + d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin); + d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin); + d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin); + d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin); + d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin); + break; + } + + switch (line->blk->page->primaryRot) { + case 0: + xMin = blk->xMin + d0 * (blk->xMax - blk->xMin); + xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); + yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); + yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); + base = blk->yMin + base * (blk->yMax - blk->yMin); + break; + case 1: + xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); + xMax = blk->xMax - d2 * (blk->xMax - blk->xMin); + yMin = blk->yMin + d0 * (blk->yMax - blk->yMin); + yMax = blk->yMin + d1 * (blk->yMax - blk->yMin); + base = blk->xMax - d4 * (blk->xMax - blk->xMin); + break; + case 2: + xMin = blk->xMax - d1 * (blk->xMax - blk->xMin); + xMax = blk->xMax - d0 * (blk->xMax - blk->xMin); + yMin = blk->yMax - d3 * (blk->yMax - blk->yMin); + yMax = blk->yMax - d2 * (blk->yMax - blk->yMin); + base = blk->yMax - d4 * (blk->yMax - blk->yMin); + break; + case 3: + xMin = blk->xMin + d2 * (blk->xMax - blk->xMin); + xMax = blk->xMin + d3 * (blk->xMax - blk->xMin); + yMin = blk->yMax - d1 * (blk->yMax - blk->yMin); + yMax = blk->yMax - d0 * (blk->yMax - blk->yMin); + base = blk->xMin + d4 * (blk->xMax - blk->xMin); + break; + } + + } + } +} + +int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->blk->page->primaryRot) { + case 0: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag1->xMin - frag2->xMin; + } + break; + case 1: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag1->yMin - frag2->yMin; + } + break; + case 2: + if ((cmp = frag2->yMin - frag1->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; + } + break; + case 3: + if ((cmp = frag1->xMax - frag2->xMax) == 0) { + cmp = frag2->yMax - frag1->yMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->rot) { + case 0: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag1->xMin - frag2->xMin; + } + break; + case 1: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag1->yMin - frag2->yMin; + } + break; + case 2: + if ((cmp = frag2->yMin - frag1->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; + } + break; + case 3: + if ((cmp = frag1->xMax - frag2->xMax) == 0) { + cmp = frag2->yMax - frag1->yMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->rot) { + case 0: + if ((cmp = frag1->xMin - frag2->xMin) == 0) { + cmp = frag1->yMin - frag2->yMin; + } + break; + case 1: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; + } + break; + case 2: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag2->yMin - frag1->yMin; + } + break; + case 3: + if ((cmp = frag2->yMax - frag1->yMax) == 0) { + cmp = frag1->xMax - frag2->xMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } //------------------------------------------------------------------------ // TextBlock //------------------------------------------------------------------------ -TextBlock::TextBlock() { +TextBlock::TextBlock(TextPage *pageA, int rotA) { + page = pageA; + rot = rotA; + xMin = yMin = 0; + xMax = yMax = -1; + priMin = 0; + priMax = page->pageWidth; + pool = new TextPool(); lines = NULL; + curLine = NULL; next = NULL; + stackNext = NULL; } TextBlock::~TextBlock() { - TextLine *l1, *l2; + TextLine *line; - for (l1 = lines; l1; l1 = l2) { - l2 = l1->next; - delete l1; + delete pool; + while (lines) { + line = lines; + lines = lines->next; + delete line; } } -// Returns true if comes before in xy order, allowing -// slack for vertically overlapping blocks. -GBool TextBlock::yxBefore(TextBlock *blk2) { - double dy; +void TextBlock::addWord(TextWord *word) { + pool->addWord(word); + if (xMin > xMax) { + xMin = word->xMin; + xMax = word->xMax; + yMin = word->yMin; + yMax = word->yMax; + } else { + if (word->xMin < xMin) { + xMin = word->xMin; + } + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMin < yMin) { + yMin = word->yMin; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + } +} - dy = blkOverlapSlack * lines->fontSize; +void TextBlock::coalesce(UnicodeMap *uMap) { + TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord; + TextLine *line, *line0, *line1; + int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; + int baseIdx, bestWordBaseIdx, idx0, idx1; + double minBase, maxBase; + double fontSize, delta, priDelta, secDelta; + TextLine **lineArray; + GBool found; + int col1, col2; + int i, j, k; + + // discard duplicated text (fake boldface, drop shadows) + for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) { + word0 = pool->getPool(idx0); + while (word0) { + priDelta = dupMaxPriDelta * word0->fontSize; + secDelta = dupMaxSecDelta * word0->fontSize; + if (rot == 0 || rot == 3) { + maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); + } else { + maxBaseIdx = pool->getBaseIdx(word0->base - secDelta); + } + found = gFalse; + word1 = word2 = NULL; // make gcc happy + for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) { + if (idx1 == idx0) { + word1 = word0; + word2 = word0->next; + } else { + word1 = NULL; + word2 = pool->getPool(idx1); + } + for (; word2; word1 = word2, word2 = word2->next) { + if (word2->len == word0->len && + !memcmp(word2->text, word0->text, + word0->len * sizeof(Unicode))) { + switch (rot) { + case 0: + case 2: + found = fabs(word0->xMin - word2->xMin) < priDelta && + fabs(word0->xMax - word2->xMax) < priDelta && + fabs(word0->yMin - word2->yMin) < secDelta && + fabs(word0->yMax - word2->yMax) < secDelta; + break; + case 1: + case 3: + found = fabs(word0->xMin - word2->xMin) < secDelta && + fabs(word0->xMax - word2->xMax) < secDelta && + fabs(word0->yMin - word2->yMin) < priDelta && + fabs(word0->yMax - word2->yMax) < priDelta; + break; + } + } + if (found) { + break; + } + } + if (found) { + break; + } + } + if (found) { + if (word1) { + word1->next = word2->next; + } else { + pool->setPool(idx1, word2->next); + } + delete word2; + } else { + word0 = word0->next; + } + } + } + + // build the lines + curLine = NULL; + poolMinBaseIdx = pool->minBaseIdx; + charCount = 0; + nLines = 0; + while (1) { - // non-overlapping case - if (blk2->yMin > yMax - dy || - blk2->yMax < yMin + dy) { - return yMin < blk2->yMin || - (yMin == blk2->yMin && xMin < blk2->xMin); + // find the first non-empty line in the pool + for (; + poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); + ++poolMinBaseIdx) ; + if (poolMinBaseIdx > pool->maxBaseIdx) { + break; + } + + // look for the left-most word in the first four lines of the + // pool -- this avoids starting with a superscript word + startBaseIdx = poolMinBaseIdx; + for (baseIdx = poolMinBaseIdx + 1; + baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; + ++baseIdx) { + if (!pool->getPool(baseIdx)) { + continue; + } + if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) + < 0) { + startBaseIdx = baseIdx; + } + } + + // create a new line + word0 = pool->getPool(startBaseIdx); + pool->setPool(startBaseIdx, word0->next); + word0->next = NULL; + line = new TextLine(this, word0->rot, word0->base); + line->addWord(word0); + lastWord = word0; + + // compute the search range + fontSize = word0->fontSize; + minBase = word0->base - maxIntraLineDelta * fontSize; + maxBase = word0->base + maxIntraLineDelta * fontSize; + minBaseIdx = pool->getBaseIdx(minBase); + maxBaseIdx = pool->getBaseIdx(maxBase); + + // find the rest of the words in this line + while (1) { + + // find the left-most word whose baseline is in the range for + // this line + bestWordBaseIdx = 0; + bestWord0 = bestWord1 = NULL; + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + for (word0 = NULL, word1 = pool->getPool(baseIdx); + word1; + word0 = word1, word1 = word1->next) { + if (word1->base >= minBase && + word1->base <= maxBase && + (delta = lastWord->primaryDelta(word1)) >= + minCharSpacing * fontSize) { + if (delta < maxWordSpacing * fontSize && + (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) { + bestWordBaseIdx = baseIdx; + bestWord0 = word0; + bestWord1 = word1; + } + break; + } + } + } + if (!bestWord1) { + break; + } + + // remove it from the pool, and add it to the line + if (bestWord0) { + bestWord0->next = bestWord1->next; + } else { + pool->setPool(bestWordBaseIdx, bestWord1->next); + } + bestWord1->next = NULL; + line->addWord(bestWord1); + lastWord = bestWord1; + } + + // add the line + if (curLine && line->cmpYX(curLine) > 0) { + line0 = curLine; + line1 = curLine->next; + } else { + line0 = NULL; + line1 = lines; + } + for (; + line1 && line->cmpYX(line1) > 0; + line0 = line1, line1 = line1->next) ; + if (line0) { + line0->next = line; + } else { + lines = line; + } + line->next = line1; + curLine = line; + line->coalesce(uMap); + charCount += line->len; + ++nLines; + } + + // sort lines into xy order for column assignment + lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *)); + for (line = lines, i = 0; line; line = line->next, ++i) { + lineArray[i] = line; } + qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY); - // overlapping case - return xMin < blk2->xMin; + // column assignment + nColumns = 0; + for (i = 0; i < nLines; ++i) { + line0 = lineArray[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + line1 = lineArray[j]; + if (line1->primaryDelta(line0) >= 0) { + col2 = line1->col[line1->len] + 1; + } else { + k = 0; // make gcc happy + switch (rot) { + case 0: + for (k = 0; + k < line1->len && + line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 1: + for (k = 0; + k < line1->len && + line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 2: + for (k = 0; + k < line1->len && + line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 3: + for (k = 0; + k < line1->len && + line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + } + col2 = line1->col[k]; + } + if (col2 > col1) { + col1 = col2; + } + } + for (k = 0; k <= line0->len; ++k) { + line0->col[k] += col1; + } + if (line0->col[line0->len] > nColumns) { + nColumns = line0->col[line0->len]; + } + } + gfree(lineArray); } -// Merge another block's line onto the right of this one. -void TextBlock::mergeRight(TextBlock *blk2) { - lines->merge(blk2->lines); - xMax = lines->xMax; - yMin = lines->yMin; - yMax = lines->yMax; - xSpaceR = lines->xSpaceR; +void TextBlock::updatePriMinMax(TextBlock *blk) { + double newPriMin, newPriMax; + GBool gotPriMin, gotPriMax; + + gotPriMin = gotPriMax = gFalse; + newPriMin = newPriMax = 0; // make gcc happy + switch (page->primaryRot) { + case 0: + case 2: + if (blk->yMin < yMax && blk->yMax > yMin) { + if (blk->xMin < xMin) { + newPriMin = blk->xMax; + gotPriMin = gTrue; + } + if (blk->xMax > xMax) { + newPriMax = blk->xMin; + gotPriMax = gTrue; + } + } + break; + case 1: + case 3: + if (blk->xMin < xMax && blk->xMax > xMin) { + if (blk->yMin < yMin) { + newPriMin = blk->yMax; + gotPriMin = gTrue; + } + if (blk->yMax > yMax) { + newPriMax = blk->yMin; + gotPriMax = gTrue; + } + } + break; + } + if (gotPriMin) { + if (newPriMin > xMin) { + newPriMin = xMin; + } + if (newPriMin > priMin) { + priMin = newPriMin; + } + } + if (gotPriMax) { + if (newPriMax < xMax) { + newPriMax = xMax; + } + if (newPriMax < priMax) { + priMax = newPriMax; + } + } } -// Merge another block's lines onto the bottom of this block. -void TextBlock::mergeBelow(TextBlock *blk2) { - TextLine *line; +int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) { + TextBlock *blk1 = *(TextBlock **)p1; + TextBlock *blk2 = *(TextBlock **)p2; + double cmp; - if (blk2->xMin < xMin) { - xMin = blk2->xMin; + cmp = 0; // make gcc happy + switch (blk1->page->primaryRot) { + case 0: + if ((cmp = blk1->xMin - blk2->xMin) == 0) { + cmp = blk1->yMin - blk2->yMin; + } + break; + case 1: + if ((cmp = blk1->yMin - blk2->yMin) == 0) { + cmp = blk2->xMax - blk1->xMax; + } + break; + case 2: + if ((cmp = blk2->xMax - blk1->xMax) == 0) { + cmp = blk2->yMin - blk1->yMin; + } + break; + case 3: + if ((cmp = blk2->yMax - blk1->yMax) == 0) { + cmp = blk1->xMax - blk2->xMax; + } + break; } - if (blk2->xMax > xMax) { - xMax = blk2->xMax; + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) { + TextBlock *blk1 = *(TextBlock **)p1; + TextBlock *blk2 = *(TextBlock **)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (blk1->page->primaryRot) { + case 0: + if ((cmp = blk1->yMin - blk2->yMin) == 0) { + cmp = blk1->xMin - blk2->xMin; + } + break; + case 1: + if ((cmp = blk2->xMax - blk1->xMax) == 0) { + cmp = blk1->yMin - blk2->yMin; + } + break; + case 2: + if ((cmp = blk2->yMin - blk1->yMin) == 0) { + cmp = blk2->xMax - blk1->xMax; + } + break; + case 3: + if ((cmp = blk1->xMax - blk2->xMax) == 0) { + cmp = blk2->yMax - blk1->yMax; + } + break; } - yMax = blk2->yMax; - if (blk2->xSpaceL > xSpaceL) { - xSpaceL = blk2->xSpaceL; + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextBlock::primaryCmp(TextBlock *blk) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - blk->xMin; + break; + case 1: + cmp = yMin - blk->yMin; + break; + case 2: + cmp = blk->xMax - xMax; + break; + case 3: + cmp = blk->yMax - yMax; + break; } - if (blk2->xSpaceR < xSpaceR) { - xSpaceR = blk2->xSpaceR; + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +double TextBlock::secondaryDelta(TextBlock *blk) { + double delta; + + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = blk->yMin - yMax; + break; + case 1: + delta = xMin - blk->xMax; + break; + case 2: + delta = yMin - blk->yMax; + break; + case 3: + delta = blk->xMin - xMax; + break; } - if (blk2->maxFontSize > maxFontSize) { - maxFontSize = blk2->maxFontSize; + return delta; +} + +GBool TextBlock::isBelow(TextBlock *blk) { + GBool below; + + below = gFalse; // make gcc happy + switch (page->primaryRot) { + case 0: + below = xMin >= blk->priMin && xMax <= blk->priMax && + yMin > blk->yMin; + break; + case 1: + below = yMin >= blk->priMin && yMax <= blk->priMax && + xMax < blk->xMax; + break; + case 2: + below = xMin >= blk->priMin && xMax <= blk->priMax && + yMax < blk->yMax; + break; + case 3: + below = yMin >= blk->priMin && yMax <= blk->priMax && + xMin > blk->xMin; + break; } - for (line = lines; line->next; line = line->next) ; - line->next = line->flowNext = blk2->lines; - blk2->lines = NULL; + + return below; } //------------------------------------------------------------------------ // TextFlow //------------------------------------------------------------------------ -TextFlow::TextFlow() { - blocks = NULL; +TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) { + page = pageA; + xMin = blk->xMin; + xMax = blk->xMax; + yMin = blk->yMin; + yMax = blk->yMax; + priMin = blk->priMin; + priMax = blk->priMax; + blocks = lastBlk = blk; next = NULL; } TextFlow::~TextFlow() { - TextBlock *b1, *b2; + TextBlock *blk; - for (b1 = blocks; b1; b1 = b2) { - b2 = b1->next; - delete b1; + while (blocks) { + blk = blocks; + blocks = blocks->next; + delete blk; } } +void TextFlow::addBlock(TextBlock *blk) { + if (lastBlk) { + lastBlk->next = blk; + } else { + blocks = blk; + } + lastBlk = blk; + if (blk->xMin < xMin) { + xMin = blk->xMin; + } + if (blk->xMax > xMax) { + xMax = blk->xMax; + } + if (blk->yMin < yMin) { + yMin = blk->yMin; + } + if (blk->yMax > yMax) { + yMax = blk->yMax; + } +} + +GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) { + GBool fits; + + // lower blocks must use smaller fonts + if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) { + return gFalse; + } + + fits = gFalse; // make gcc happy + switch (page->primaryRot) { + case 0: + fits = blk->xMin >= priMin && blk->xMax <= priMax; + break; + case 1: + fits = blk->yMin >= priMin && blk->yMax <= priMax; + break; + case 2: + fits = blk->xMin >= priMin && blk->xMax <= priMax; + break; + case 3: + fits = blk->yMin >= priMin && blk->yMax <= priMax; + break; + } + return fits; +} + +#if TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextWordList +//------------------------------------------------------------------------ + +TextWordList::TextWordList(TextPage *text, GBool physLayout) { + TextFlow *flow; + TextBlock *blk; + TextLine *line; + TextWord *word; + TextWord **wordArray; + int nWords, i; + + words = new GList(); + + if (text->rawOrder) { + for (word = text->rawWords; word; word = word->next) { + words->append(word); + } + + } else if (physLayout) { + // this is inefficient, but it's also the least useful of these + // three cases + nWords = 0; + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + ++nWords; + } + } + } + } + wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *)); + i = 0; + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + wordArray[i++] = word; + } + } + } + } + qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX); + for (i = 0; i < nWords; ++i) { + words->append(wordArray[i]); + } + gfree(wordArray); + + } else { + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + words->append(word); + } + } + } + } + } +} + +TextWordList::~TextWordList() { + delete words; +} + +int TextWordList::getLength() { + return words->getLength(); +} + +TextWord *TextWordList::get(int idx) { + if (idx < 0 || idx >= words->getLength()) { + return NULL; + } + return (TextWord *)words->get(idx); +} + +#endif // TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool rawOrderA) { + int rot; + rawOrder = rawOrderA; curWord = NULL; charPos = 0; - font = NULL; - fontSize = 0; + curFont = NULL; + curFontSize = 0; nest = 0; nTinyChars = 0; - words = wordPtr = NULL; - lines = NULL; + lastCharOverlap = gFalse; + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + pools[rot] = new TextPool(); + } + } flows = NULL; + blocks = NULL; + rawWords = NULL; + rawLastWord = NULL; fonts = new GList(); + lastFindXMin = lastFindYMin = 0; + haveLastFind = gFalse; } TextPage::~TextPage() { + int rot; + clear(); + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + delete pools[rot]; + } + } delete fonts; } +void TextPage::startPage(GfxState *state) { + clear(); + if (state) { + pageWidth = state->getPageWidth(); + pageHeight = state->getPageHeight(); + } else { + pageWidth = pageHeight = 0; + } +} + +void TextPage::endPage() { + if (curWord) { + endWord(); + } +} + +void TextPage::clear() { + int rot; + TextFlow *flow; + TextWord *word; + + if (curWord) { + delete curWord; + curWord = NULL; + } + if (rawOrder) { + while (rawWords) { + word = rawWords; + rawWords = rawWords->next; + delete word; + } + } else { + for (rot = 0; rot < 4; ++rot) { + delete pools[rot]; + } + while (flows) { + flow = flows; + flows = flows->next; + delete flow; + } + gfree(blocks); + } + deleteGList(fonts, TextFontInfo); + + curWord = NULL; + charPos = 0; + curFont = NULL; + curFontSize = 0; + nest = 0; + nTinyChars = 0; + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + pools[rot] = new TextPool(); + } + } + flows = NULL; + blocks = NULL; + rawWords = NULL; + rawLastWord = NULL; + fonts = new GList(); +} + void TextPage::updateFont(GfxState *state) { GfxFont *gfxFont; double *fm; @@ -454,22 +1697,22 @@ void TextPage::updateFont(GfxState *state) { int i; // get the font info object - font = NULL; + curFont = NULL; for (i = 0; i < fonts->getLength(); ++i) { - font = (TextFontInfo *)fonts->get(i); - if (font->matches(state)) { + curFont = (TextFontInfo *)fonts->get(i); + if (curFont->matches(state)) { break; } - font = NULL; + curFont = NULL; } - if (!font) { - font = new TextFontInfo(state); - fonts->append(font); + if (!curFont) { + curFont = new TextFontInfo(state); + fonts->append(curFont); } // adjust the font size gfxFont = state->getFont(); - fontSize = state->getTransformedFontSize(); + curFontSize = state->getTransformedFontSize(); if (gfxFont && gfxFont->getType() == fontType3) { // This is a hack which makes it possible to deal with some Type 3 // fonts. The problem is that it's impossible to know what the @@ -496,40 +1739,68 @@ void TextPage::updateFont(GfxState *state) { if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { // 0.6 is a generic average 'm' width -- yes, this is a hack - fontSize *= w / 0.6; + curFontSize *= w / 0.6; } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { // even more of a hack: 0.5 is a generic letter width - fontSize *= w / 0.5; + curFontSize *= w / 0.5; } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { // better than nothing: 0.5 is a generic character width - fontSize *= w / 0.5; + curFontSize *= w / 0.5; } fm = gfxFont->getFontMatrix(); if (fm[0] != 0) { - fontSize *= fabs(fm[3] / fm[0]); + curFontSize *= fabs(fm[3] / fm[0]); } } } void TextPage::beginWord(GfxState *state, double x0, double y0) { + double *txtm, *ctm, *fontm; + double m[4], m2[4]; + int rot; + // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via - // XOutputDev rather than TextOutputDev). + // {X,Win}SplashOutputDev rather than TextOutputDev). if (curWord) { ++nest; return; } - curWord = new TextWord(state, x0, y0, charPos, font, fontSize); + // compute the rotation + txtm = state->getTextMat(); + ctm = state->getCTM(); + m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2]; + m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3]; + m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2]; + m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3]; + if (state->getFont()->getType() == fontType3) { + fontm = state->getFont()->getFontMatrix(); + m2[0] = fontm[0] * m[0] + fontm[1] * m[2]; + m2[1] = fontm[0] * m[1] + fontm[1] * m[3]; + m2[2] = fontm[2] * m[0] + fontm[3] * m[2]; + m2[3] = fontm[2] * m[1] + fontm[3] * m[3]; + m[0] = m2[0]; + m[1] = m2[1]; + m[2] = m2[2]; + m[3] = m2[3]; + } + if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) { + rot = (m[3] < 0) ? 0 : 2; + } else { + rot = (m[2] > 0) ? 1 : 3; + } + + curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize); } void TextPage::addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode *u, int uLen) { - double x1, y1, w1, h1, dx2, dy2, sp; - int n, i; + double x1, y1, w1, h1, dx2, dy2, base, sp; + int i; // if the previous char was a space, addChar will have called // endWord, so we need to start a new word @@ -557,7 +1828,7 @@ void TextPage::addChar(GfxState *state, double x, double y, // check the tiny chars limit if (!globalParams->getTextKeepTinyChars() && fabs(w1) < 3 && fabs(h1) < 3) { - if (++nTinyChars > 20000) { + if (++nTinyChars > 50000) { return; } } @@ -570,20 +1841,55 @@ void TextPage::addChar(GfxState *state, double x, double y, return; } - // large char spacing is sometimes used to move text around -- in - // this case, break text into individual chars and let the coalesce - // function deal with it later - n = curWord->len; - if (n > 0 && x1 - curWord->xRight[n-1] > - curWord->font->minSpaceWidth * curWord->fontSize) { - endWord(); - beginWord(state, x, y); + // start a new word if: + // (1) this character's baseline doesn't match the current word's + // baseline, or + // (2) there is space between the end of the current word and this + // character, or + // (3) this character overlaps the previous one (duplicated text), or + // (4) the previous character was an overlap (we want each duplicated + // characters to be in a word by itself) + base = sp = 0; // make gcc happy + if (curWord->len > 0) { + switch (curWord->rot) { + case 0: + base = y1; + sp = x1 - curWord->xMax; + break; + case 1: + base = x1; + sp = y1 - curWord->yMax; + break; + case 2: + base = y1; + sp = curWord->xMin - x1; + break; + case 3: + base = x1; + sp = curWord->yMin - y1; + break; + } + if (fabs(base - curWord->base) > 0.5 || + sp > minWordBreakSpace * curWord->fontSize || + sp < -minDupBreakOverlap * curWord->fontSize || + lastCharOverlap) { + lastCharOverlap = gTrue; + endWord(); + beginWord(state, x, y); + } else { + lastCharOverlap = gFalse; + } + } else { + lastCharOverlap = gFalse; } // page rotation and/or transform matrices can cause text to be // drawn in reverse order -- in this case, swap the begin/end // coordinates and break text into individual chars - if (w1 < 0) { + if ((curWord->rot == 0 && w1 < 0) || + (curWord->rot == 1 && h1 < 0) || + (curWord->rot == 2 && w1 > 0) || + (curWord->rot == 3 && h1 > 0)) { endWord(); beginWord(state, x + dx, y + dy); x1 += w1; @@ -607,7 +1913,7 @@ void TextPage::addChar(GfxState *state, double x, double y, void TextPage::endWord() { // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via - // XOutputDev rather than TextOutputDev). + // {X,Win}SplashOutputDev rather than TextOutputDev). if (nest > 0) { --nest; return; @@ -620,8 +1926,6 @@ void TextPage::endWord() { } void TextPage::addWord(TextWord *word) { - TextWord *p1, *p2; - // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway if (word->len == 0) { @@ -629,533 +1933,402 @@ void TextPage::addWord(TextWord *word) { return; } - // insert word in xy list if (rawOrder) { - p1 = wordPtr; - p2 = NULL; - } else { - if (wordPtr && wordPtr->xyBefore(word)) { - p1 = wordPtr; - p2 = wordPtr->next; + if (rawLastWord) { + rawLastWord->next = word; } else { - p1 = NULL; - p2 = words; - } - for (; p2; p1 = p2, p2 = p2->next) { - if (word->xyBefore(p2)) { - break; - } + rawWords = word; } - } - if (p1) { - p1->next = word; + rawLastWord = word; } else { - words = word; + pools[word->rot]->addWord(word); } - word->next = p2; - wordPtr = word; } void TextPage::coalesce(GBool physLayout) { + UnicodeMap *uMap; + TextPool *pool; TextWord *word0, *word1, *word2; - TextLine *line0, *line1, *line2, *line3, *line4, *lineList; - TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6; - TextBlock *yxBlocks, *blocks, *blkStack; - TextFlow *flow0, *flow1; - double sz, xLimit, yLimit; - double fit1, fit2, sp1, sp2; + TextLine *line; + TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1; + TextBlock **blkArray; + TextFlow *flow, *lastFlow; + int rot, poolMinBaseIdx, baseIdx, startBaseIdx; + double minBase, maxBase, newMinBase, newMaxBase; + double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace; GBool found; - UnicodeMap *uMap; - GBool isUnicode; - char buf[8]; - int col1, col2, d, i, j; + int count[4]; + int lrCount; + int firstBlkIdx, nBlocksLeft; + int col1, col2; + int i, j, n; -#if 0 // for debugging - printf("*** initial word list ***\n"); - for (word0 = words; word0; word0 = word0->next) { - printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); + if (rawOrder) { + primaryRot = 0; + primaryLR = gTrue; + return; } - printf("\n"); - fflush(stdout); -#endif - //----- discard duplicated text (fake boldface, drop shadows) - - word0 = words; - while (word0) { - sz = word0->fontSize; - xLimit = word0->xMin + sz * dupMaxDeltaX; - found = gFalse; - for (word1 = word0, word2 = word0->next; - word2 && word2->xMin < xLimit; - word1 = word2, word2 = word2->next) { - if (word2->len == word0->len && - !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) && - fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY && - fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY && - fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) { - found = gTrue; - break; - } - } - if (found) { - word1->next = word2->next; - delete word2; - } else { - word0 = word0->next; - } - } + uMap = globalParams->getTextEncoding(); + blkList = NULL; + lastBlk = NULL; + nBlocks = 0; + primaryRot = -1; #if 0 // for debugging - printf("*** words after removing duplicate text ***\n"); - for (word0 = words; word0; word0 = word0->next) { - printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); - } - printf("\n"); - fflush(stdout); -#endif - - //----- merge words - - word0 = words; - while (word0) { - sz = word0->fontSize; - - // look for adjacent text which is part of the same word, and - // merge it into this word - xLimit = word0->xMax + sz * word0->font->minSpaceWidth; - if (rawOrder) { - word1 = word0; - word2 = word0->next; - found = word2 && - word2->xMin < xLimit && - word2->font == word0->font && - fabs(word2->fontSize - sz) < 0.05 && - fabs(word2->yBase - word0->yBase) < 0.05 && - word2->charPos == word0->charPos + word0->charLen; - } else { - found = gFalse; - for (word1 = word0, word2 = word0->next; - word2 && word2->xMin < xLimit; - word1 = word2, word2 = word2->next) { - if (word2->font == word0->font && - fabs(word2->fontSize - sz) < 0.05 && - fabs(word2->yBase - word0->yBase) < 0.05 && - word2->charPos == word0->charPos + word0->charLen) { - found = gTrue; - break; + printf("*** initial words ***\n"); + for (rot = 0; rot < 4; ++rot) { + pool = pools[rot]; + for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) { + for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); } + printf("'\n"); } } - if (found) { - word0->merge(word2); - word1->next = word2->next; - delete word2; - continue; - } - - word0 = word0->next; - } - -#if 0 // for debugging - printf("*** after merging words ***\n"); - for (word0 = words; word0; word0 = word0->next) { - printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); } printf("\n"); - fflush(stdout); #endif - //----- assemble words into lines + //----- assemble the blocks - lineList = line0 = NULL; - while (words) { - - // remove the first word from the word list - word0 = words; - words = words->next; - word0->next = NULL; - - // find the best line (if any) for the word - if (rawOrder) { - if (line0 && lineFit(line0, word0, &sp2) >= 0) { - line1 = line0; - sp1 = sp2; - } else { - line1 = NULL; - sp1 = 0; - } - } else { - line1 = NULL; - fit1 = 0; - sp1 = 0; - for (line2 = lineList; line2; line2 = line2->next) { - fit2 = lineFit(line2, word0, &sp2); - if (fit2 >= 0 && (!line1 || fit2 < fit1)) { - line1 = line2; - fit1 = fit2; - sp1 = sp2; - } - } - } - - // found a line: append the word - if (line1) { - word1 = line1->lastWord; - word1->next = word0; - line1->lastWord = word0; - if (word0->xMax > line1->xMax) { - line1->xMax = word0->xMax; - } - if (word0->yMin < line1->yMin) { - line1->yMin = word0->yMin; - } - if (word0->yMax > line1->yMax) { - line1->yMax = word0->yMax; - } - line1->len += word0->len; - if (sp1 > line1->fontSize * line1->font->minSpaceWidth) { - word1->spaceAfter = gTrue; - ++line1->len; - } - - // didn't find a line: create a new line - } else { - line1 = new TextLine(); - line1->words = line1->lastWord = word0; - line1->xMin = word0->xMin; - line1->xMax = word0->xMax; - line1->yMin = word0->yMin; - line1->yMax = word0->yMax; - line1->yBase = word0->yBase; - line1->font = word0->font; - line1->fontSize = word0->fontSize; - line1->len = word0->len; - if (line0) { - line0->next = line1; - } else { - lineList = line1; - } - line0 = line1; - } - } - - // build the line text - uMap = globalParams->getTextEncoding(); - isUnicode = uMap ? uMap->isUnicode() : gFalse; - - for (line1 = lineList; line1; line1 = line1->next) { - line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode)); - line1->xRight = (double *)gmalloc(line1->len * sizeof(double)); - line1->col = (int *)gmalloc(line1->len * sizeof(int)); - i = 0; - for (word1 = line1->words; word1; word1 = word1->next) { - for (j = 0; j < word1->len; ++j) { - line1->text[i] = word1->text[j]; - line1->xRight[i] = word1->xRight[j]; - ++i; - } - if (word1->spaceAfter && word1->next) { - line1->text[i] = (Unicode)0x0020; - line1->xRight[i] = word1->next->xMin; - ++i; - } - } - line1->convertedLen = 0; - for (j = 0; j < line1->len; ++j) { - line1->col[j] = line1->convertedLen; - if (isUnicode) { - ++line1->convertedLen; - } else if (uMap) { - line1->convertedLen += - uMap->mapUnicode(line1->text[j], buf, sizeof(buf)); - } - } - - // check for hyphen at end of line - //~ need to check for other chars used as hyphens - if (line1->text[line1->len - 1] == (Unicode)'-') { - line1->hyphenated = gTrue; - } + //~ add an outer loop for writing mode (vertical text) - } + // build blocks for each rotation value + for (rot = 0; rot < 4; ++rot) { + pool = pools[rot]; + poolMinBaseIdx = pool->minBaseIdx; + count[rot] = 0; - if (uMap) { - uMap->decRefCnt(); - } + // add blocks until no more words are left + while (1) { -#if 0 // for debugging - printf("*** lines in xy order ***\n"); - for (line0 = lineList; line0; line0 = line0->next) { - printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); + // find the first non-empty line in the pool + for (; + poolMinBaseIdx <= pool->maxBaseIdx && + !pool->getPool(poolMinBaseIdx); + ++poolMinBaseIdx) ; + if (poolMinBaseIdx > pool->maxBaseIdx) { + break; } - printf("'\n"); - } - } - printf("\n"); - fflush(stdout); -#endif - - //----- column assignment - for (line1 = lineList; line1; line1 = line1->next) { - col1 = 0; - for (line2 = lineList; line2 != line1; line2 = line2->next) { - if (line1->xMin >= line2->xMax) { - d = (int)((line1->xMin - line2->xMax) / - (line1->font->maxSpaceWidth * line1->fontSize)); - if (d > 4) { - d = 4; - } - col2 = line2->col[0] + line2->convertedLen + d; - if (col2 > col1) { - col1 = col2; + // look for the left-most word in the first four lines of the + // pool -- this avoids starting with a superscript word + startBaseIdx = poolMinBaseIdx; + for (baseIdx = poolMinBaseIdx + 1; + baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; + ++baseIdx) { + if (!pool->getPool(baseIdx)) { + continue; } - } else if (line1->xMin > line2->xMin) { - for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ; - col2 = line2->col[i]; - if (col2 > col1) { - col1 = col2; + if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) + < 0) { + startBaseIdx = baseIdx; } } - } - for (j = 0; j < line1->len; ++j) { - line1->col[j] += col1; - } - } - -#if 0 // for debugging - printf("*** lines in xy order, after column assignment ***\n"); - for (line0 = lineList; line0; line0 = line0->next) { - printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->col[0], line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); - } - } - printf("\n"); - fflush(stdout); -#endif - - //----- assemble lines into blocks - - if (rawOrder) { - - lines = lineList; - for (line1 = lines; line1; line1 = line1->next) { - line1->xSpaceL = 0; - line1->xSpaceR = pageWidth; - } - - } else { - // sort lines into yx order - lines = NULL; - while (lineList) { - line0 = lineList; - lineList = lineList->next; - for (line1 = NULL, line2 = lines; - line2 && !line0->yxBefore(line2); - line1 = line2, line2 = line2->next) ; - if (line1) { - line1->next = line0; - } else { - lines = line0; - } - line0->next = line2; - } - - // compute whitespace to left and right of each line - line0 = lines; - for (line1 = lines; line1; line1 = line1->next) { - - // find the first vertically overlapping line - for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ; - - // check each vertically overlapping line -- look for the nearest - // on each side - line1->xSpaceL = 0; - line1->xSpaceR = pageWidth; - for (line2 = line0; - line2 && line2->yMin < line1->yMax; - line2 = line2->next) { - if (line2->yMax > line1->yMin) { - if (line2->xMax < line1->xMin) { - if (line2->xMax > line1->xSpaceL) { - line1->xSpaceL = line2->xMax; - } - } else if (line2->xMin > line1->xMax) { - if (line2->xMin < line1->xSpaceR) { - line1->xSpaceR = line2->xMin; + // create a new block + word0 = pool->getPool(startBaseIdx); + pool->setPool(startBaseIdx, word0->next); + word0->next = NULL; + blk = new TextBlock(this, rot); + blk->addWord(word0); + + fontSize = word0->fontSize; + minBase = maxBase = word0->base; + colSpace1 = minColSpacing1 * fontSize; + colSpace2 = minColSpacing2 * fontSize; + lineSpace = maxLineSpacingDelta * fontSize; + intraLineSpace = maxIntraLineDelta * fontSize; + + // add words to the block + do { + found = gFalse; + + // look for words on the line above the current top edge of + // the block + newMinBase = minBase; + for (baseIdx = pool->getBaseIdx(minBase); + baseIdx >= pool->getBaseIdx(minBase - lineSpace); + --baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base < minBase && + word1->base >= minBase - lineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) + : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta1 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + newMinBase = word2->base; + } else { + word0 = word1; + word1 = word1->next; } } } - } - } - } // (!rawOrder) - -#if 0 // for debugging - printf("*** lines in yx order ***\n"); - for (line0 = lines; line0; line0 = line0->next) { - printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->fontSize, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); - } - } - printf("\n"); - fflush(stdout); -#endif - - lineList = lines; - yxBlocks = NULL; - blk0 = NULL; - while (lineList) { - - // build a new block object - line0 = lineList; - lineList = lineList->next; - line0->next = NULL; - blk1 = new TextBlock(); - blk1->lines = line0; - blk1->xMin = line0->xMin; - blk1->xMax = line0->xMax; - blk1->yMin = line0->yMin; - blk1->yMax = line0->yMax; - blk1->xSpaceL = line0->xSpaceL; - blk1->xSpaceR = line0->xSpaceR; - blk1->maxFontSize = line0->fontSize; - - // find subsequent lines in the block - while (lineList) { - - // look for the first horizontally overlapping line below this - // one - yLimit = line0->yMax + blkMaxSpacing * line0->fontSize; - line3 = line4 = NULL; - if (rawOrder) { - if (lineList->yMin < yLimit && - lineList->xMax > blk1->xMin && - lineList->xMin < blk1->xMax) { - line3 = NULL; - line4 = lineList; + minBase = newMinBase; + + // look for words on the line below the current bottom edge of + // the block + newMaxBase = maxBase; + for (baseIdx = pool->getBaseIdx(maxBase); + baseIdx <= pool->getBaseIdx(maxBase + lineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base > maxBase && + word1->base <= maxBase + lineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) + : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta1 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + newMaxBase = word2->base; + } else { + word0 = word1; + word1 = word1->next; + } + } } - } else { - for (line1 = NULL, line2 = lineList; - line2 && line2->yMin < yLimit; - line1 = line2, line2 = line2->next) { - if (line2->xMax > blk1->xMin && - line2->xMin < blk1->xMax) { - line3 = line1; - line4 = line2; - break; + maxBase = newMaxBase; + + // look for words that are on lines already in the block, and + // that overlap the block horizontally + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax + colSpace1 && + word1->xMax > blk->xMin - colSpace1) + : (word1->yMin < blk->yMax + colSpace1 && + word1->yMax > blk->yMin - colSpace1)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta2 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + } else { + word0 = word1; + word1 = word1->next; + } } } - } - // if there is an overlapping line and it fits in the block, add - // it to the block - if (line4 && blockFit(blk1, line4)) { - if (line3) { - line3->next = line4->next; - } else { - lineList = line4->next; - } - line0->next = line0->flowNext = line4; - line4->next = NULL; - if (line4->xMin < blk1->xMin) { - blk1->xMin = line4->xMin; - } else if (line4->xMax > blk1->xMax) { - blk1->xMax = line4->xMax; + // only check for outlying words (the next two chunks of code) + // if we didn't find anything else + if (found) { + continue; } - if (line4->yMax > blk1->yMax) { - blk1->yMax = line4->yMax; + + // scan down the left side of the block, looking for words + // that are near (but not overlapping) the block; if there are + // three or fewer, add them to the block + n = 0; + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMax <= blk->xMin && + word1->xMax > blk->xMin - colSpace2) + : (word1->yMax <= blk->yMin && + word1->yMax > blk->yMin - colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + ++n; + break; + } + word1 = word1->next; + } } - if (line4->xSpaceL > blk1->xSpaceL) { - blk1->xSpaceL = line4->xSpaceL; + if (n > 0 && n <= 3) { + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMax <= blk->xMin && + word1->xMax > blk->xMin - colSpace2) + : (word1->yMax <= blk->yMin && + word1->yMax > blk->yMin - colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + if (word2->base < minBase) { + minBase = word2->base; + } else if (word2->base > maxBase) { + maxBase = word2->base; + } + found = gTrue; + break; + } else { + word0 = word1; + word1 = word1->next; + } + } + } } - if (line4->xSpaceR < blk1->xSpaceR) { - blk1->xSpaceR = line4->xSpaceR; + + // scan down the right side of the block, looking for words + // that are near (but not overlapping) the block; if there are + // three or fewer, add them to the block + n = 0; + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin >= blk->xMax && + word1->xMin < blk->xMax + colSpace2) + : (word1->yMin >= blk->yMax && + word1->yMin < blk->yMax + colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + ++n; + break; + } + word1 = word1->next; + } } - if (line4->fontSize > blk1->maxFontSize) { - blk1->maxFontSize = line4->fontSize; + if (n > 0 && n <= 3) { + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin >= blk->xMax && + word1->xMin < blk->xMax + colSpace2) + : (word1->yMin >= blk->yMax && + word1->yMin < blk->yMax + colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + if (word2->base < minBase) { + minBase = word2->base; + } else if (word2->base > maxBase) { + maxBase = word2->base; + } + found = gTrue; + break; + } else { + word0 = word1; + word1 = word1->next; + } + } + } } - line0 = line4; - // otherwise, we're done with this block + } while (found); + + //~ need to compute the primary writing mode (horiz/vert) in + //~ addition to primary rotation + + // coalesce the block, and add it to the list + blk->coalesce(uMap); + if (lastBlk) { + lastBlk->next = blk; } else { - break; + blkList = blk; + } + lastBlk = blk; + count[rot] += blk->charCount; + if (primaryRot < 0 || count[rot] > count[primaryRot]) { + primaryRot = rot; } + ++nBlocks; } + } - // insert block on list, in yx order - if (rawOrder) { - blk2 = blk0; - blk3 = NULL; - blk0 = blk1; - } else { - for (blk2 = NULL, blk3 = yxBlocks; - blk3 && !blk1->yxBefore(blk3); - blk2 = blk3, blk3 = blk3->next) ; - } - blk1->next = blk3; - if (blk2) { - blk2->next = blk1; - } else { - yxBlocks = blk1; - } +#if 0 // for debugging + printf("*** rotation ***\n"); + for (rot = 0; rot < 4; ++rot) { + printf(" %d: %6d\n", rot, count[rot]); } + printf(" primary rot = %d\n", primaryRot); + printf("\n"); +#endif #if 0 // for debugging - printf("*** blocks in yx order ***\n"); - for (blk0 = yxBlocks; blk0; blk0 = blk0->next) { - printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", - blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); - for (line0 = blk0->lines; line0; line0 = line0->next) { - printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + printf("*** blocks ***\n"); + for (blk = blkList; blk; blk = blk->next) { + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax); + for (line = blk->lines; line; line = line->next) { + printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n", + line->xMin, line->xMax, line->yMin, line->yMax, line->base); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->spaceAfter); + word0->base, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } @@ -1164,148 +2337,109 @@ void TextPage::coalesce(GBool physLayout) { } } printf("\n"); - fflush(stdout); #endif - //----- merge lines and blocks, sort blocks into reading order - - if (rawOrder) { - blocks = yxBlocks; - - } else { - blocks = NULL; - blk0 = NULL; - blkStack = NULL; - while (yxBlocks) { - - // find the next two blocks: - // - if the depth-first traversal stack is empty, take the first - // (upper-left-most) two blocks on the yx-sorted block list - // - otherwise, find the two upper-left-most blocks under the top - // block on the stack - if (blkStack) { - blk3 = blk4 = blk5 = blk6 = NULL; - for (blk1 = NULL, blk2 = yxBlocks; - blk2; - blk1 = blk2, blk2 = blk2->next) { - if (blk2->yMin > blkStack->yMin && - blk2->xMax > blkStack->xMin && - blk2->xMin < blkStack->xMax) { - if (!blk4 || blk2->yxBefore(blk4)) { - blk5 = blk3; - blk6 = blk4; - blk3 = blk1; - blk4 = blk2; - } else if (!blk6 || blk2->yxBefore(blk6)) { - blk5 = blk1; - blk6 = blk2; - } + // determine the primary direction + lrCount = 0; + for (blk = blkList; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word0 = line->words; word0; word0 = word0->next) { + for (i = 0; i < word0->len; ++i) { + if (unicodeTypeL(word0->text[i])) { + ++lrCount; + } else if (unicodeTypeR(word0->text[i])) { + --lrCount; } } - } else { - blk3 = NULL; - blk4 = yxBlocks; - blk5 = yxBlocks; - blk6 = yxBlocks->next; } + } + } + primaryLR = lrCount >= 0; - // merge case 1: - // | | | - // | blkStack | | blkStack - // +---------------------+ --> +-------------- - // +------+ +------+ +-----------+ - // | blk4 | | blk6 | ... | blk4+blk6 | - // +------+ +------+ +-----------+ - yLimit = 0; // make gcc happy - if (blkStack) { - yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize; - } - if (blkStack && blk4 && blk6 && - !blk4->lines->next && !blk6->lines->next && - lineFit2(blk4->lines, blk6->lines) && - blk4->yMin < yLimit && - blk4->xMin > blkStack->xSpaceL && - blkStack->xMin > blk4->xSpaceL && - blk6->xMax < blkStack->xSpaceR) { - blk4->mergeRight(blk6); - if (blk5) { - blk5->next = blk6->next; +#if 0 // for debugging + printf("*** direction ***\n"); + printf("lrCount = %d\n", lrCount); + printf("primaryLR = %d\n", primaryLR); +#endif + + //----- column assignment + + // sort blocks into xy order for column assignment + blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); + for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { + blocks[i] = blk; + } + qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); + + // column assignment + for (i = 0; i < nBlocks; ++i) { + blk0 = blocks[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + blk1 = blocks[j]; + col2 = 0; // make gcc happy + switch (primaryRot) { + case 0: + if (blk0->xMin > blk1->xMax) { + col2 = blk1->col + blk1->nColumns + 3; } else { - yxBlocks = blk6->next; + col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / + (blk1->xMax - blk1->xMin)) * + blk1->nColumns); } - delete blk6; - - // merge case 2: - // | | | | - // | blkStack | | | - // +---------------------+ --> | blkStack+blk2 | - // +---------------------+ | | - // | blk4 | | | - // | | | | - } else if (blkStack && blk4 && - blk4->yMin < yLimit && - blockFit2(blkStack, blk4)) { - blkStack->mergeBelow(blk4); - if (blk3) { - blk3->next = blk4->next; + break; + case 1: + if (blk0->yMin > blk1->yMax) { + col2 = blk1->col + blk1->nColumns + 3; } else { - yxBlocks = blk4->next; + col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / + (blk1->yMax - blk1->yMin)) * + blk1->nColumns); } - delete blk4; - - // if any of: - // 1. no block found - // 2. non-fully overlapping block found - // 3. large vertical gap above the overlapping block - // then pop the stack and try again - } else if (!blk4 || - (blkStack && (blk4->xMin < blkStack->xSpaceL || - blk4->xMax > blkStack->xSpaceR || - blk4->yMin - blkStack->yMax > - blkMaxSortSpacing * blkStack->maxFontSize))) { - blkStack = blkStack->stackNext; - - // add a block to the sorted list - } else { - - // remove the block from the yx-sorted list - if (blk3) { - blk3->next = blk4->next; + break; + case 2: + if (blk0->xMax < blk1->xMin) { + col2 = blk1->col + blk1->nColumns + 3; } else { - yxBlocks = blk4->next; + col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / + (blk1->xMin - blk1->xMax)) * + blk1->nColumns); } - blk4->next = NULL; - - // append the block to the reading-order list - if (blk0) { - blk0->next = blk4; + break; + case 3: + if (blk0->yMax < blk1->yMin) { + col2 = blk1->col + blk1->nColumns + 3; } else { - blocks = blk4; - } - blk0 = blk4; - - // push the block on the traversal stack - if (!physLayout) { - blk4->stackNext = blkStack; - blkStack = blk4; + col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / + (blk1->yMin - blk1->yMax)) * + blk1->nColumns); } + break; + } + if (col2 > col1) { + col1 = col2; + } + } + blk0->col = col1; + for (line = blk0->lines; line; line = line->next) { + for (j = 0; j <= line->len; ++j) { + line->col[j] += col1; } } - } // (!rawOrder) + } #if 0 // for debugging - printf("*** blocks in reading order (after merging) ***\n"); - for (blk0 = blocks; blk0; blk0 = blk0->next) { - printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", - blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); - for (line0 = blk0->lines; line0; line0 = line0->next) { - printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + printf("*** blocks, after column assignment ***\n"); + for (blk = blkList; blk; blk = blk->next) { + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col, + blk->nColumns); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->spaceAfter); + word0->base, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } @@ -1314,409 +2448,314 @@ void TextPage::coalesce(GBool physLayout) { } } printf("\n"); - fflush(stdout); #endif - //----- assemble blocks into flows - - if (rawOrder) { - - // one flow per block - flow0 = NULL; - while (blocks) { - flow1 = new TextFlow(); - flow1->blocks = blocks; - flow1->lines = blocks->lines; - flow1->yMin = blocks->yMin; - flow1->yMax = blocks->yMax; - blocks = blocks->next; - flow1->blocks->next = NULL; - if (flow0) { - flow0->next = flow1; - } else { - flows = flow1; - } - flow0 = flow1; - } - - } else { - - // compute whitespace above and below each block - for (blk0 = blocks; blk0; blk0 = blk0->next) { - blk0->ySpaceT = 0; - blk0->ySpaceB = pageHeight; - - // check each horizontally overlapping block - for (blk1 = blocks; blk1; blk1 = blk1->next) { - if (blk1 != blk0 && - blk1->xMin < blk0->xMax && - blk1->xMax > blk0->xMin) { - if (blk1->yMax < blk0->yMin) { - if (blk1->yMax > blk0->ySpaceT) { - blk0->ySpaceT = blk1->yMax; - } - } else if (blk1->yMin > blk0->yMax) { - if (blk1->yMin < blk0->ySpaceB) { - blk0->ySpaceB = blk1->yMin; - } - } - } - } - } - - flow0 = NULL; - while (blocks) { - - // build a new flow object - flow1 = new TextFlow(); - flow1->blocks = blocks; - flow1->lines = blocks->lines; - flow1->yMin = blocks->yMin; - flow1->yMax = blocks->yMax; - flow1->ySpaceT = blocks->ySpaceT; - flow1->ySpaceB = blocks->ySpaceB; - - // find subsequent blocks in the flow - for (blk1 = blocks, blk2 = blocks->next; - blk2 && flowFit(flow1, blk2); - blk1 = blk2, blk2 = blk2->next) { - if (blk2->yMin < flow1->yMin) { - flow1->yMin = blk2->yMin; - } - if (blk2->yMax > flow1->yMax) { - flow1->yMax = blk2->yMax; - } - if (blk2->ySpaceT > flow1->ySpaceT) { - flow1->ySpaceT = blk2->ySpaceT; - } - if (blk2->ySpaceB < flow1->ySpaceB) { - flow1->ySpaceB = blk2->ySpaceB; - } - for (line1 = blk1->lines; line1->next; line1 = line1->next) ; - line1->flowNext = blk2->lines; - } + //----- reading order sort - // chop the block list - blocks = blk1->next; - blk1->next = NULL; + // sort blocks into yx order (in preparation for reading order sort) + qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot); - // append the flow to the list - if (flow0) { - flow0->next = flow1; - } else { - flows = flow1; + // compute space on left and right sides of each block + for (i = 0; i < nBlocks; ++i) { + blk0 = blocks[i]; + for (j = 0; j < nBlocks; ++j) { + blk1 = blocks[j]; + if (blk1 != blk0) { + blk0->updatePriMinMax(blk1); } - flow0 = flow1; } } #if 0 // for debugging - printf("*** flows ***\n"); - for (flow0 = flows; flow0; flow0 = flow0->next) { - printf("[flow]\n"); - for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) { - printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n", - blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax, - blk0->ySpaceT, blk0->ySpaceB); - for (line0 = blk0->lines; line0; line0 = line0->next) { - printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); - } - printf("'\n"); + printf("*** blocks, after yx sort ***\n"); + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, + blk->priMin, blk->priMax); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (j = 0; j < word0->len; ++j) { + fputc(word0->text[j] & 0xff, stdout); } + printf("'\n"); } } } printf("\n"); - fflush(stdout); #endif - //----- sort lines into yx order - - // (the block/line merging process doesn't maintain the full-page - // linked list of lines) + // build the flows + //~ this needs to be adjusted for writing mode (vertical text) + //~ this also needs to account for right-to-left column ordering + blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); + memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *)); + flows = lastFlow = NULL; + firstBlkIdx = 0; + nBlocksLeft = nBlocks; + while (nBlocksLeft > 0) { + + // find the upper-left-most block + for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ; + i = firstBlkIdx; + blk = blkArray[i]; + for (j = firstBlkIdx + 1; j < nBlocks; ++j) { + blk1 = blkArray[j]; + if (blk1) { + if (blk && blk->secondaryDelta(blk1) > 0) { + break; + } + if (blk1->primaryCmp(blk) < 0) { + i = j; + blk = blk1; + } + } + } + blkArray[i] = NULL; + --nBlocksLeft; + blk->next = NULL; - lines = NULL; - if (rawOrder) { - line0 = NULL; - for (flow0 = flows; flow0; flow0 = flow0->next) { - for (line1 = flow0->lines; line1; line1 = line1->flowNext) { - if (line0) { - line0->pageNext = line1; - } else { - lines = line1; + // create a new flow, starting with the upper-left-most block + flow = new TextFlow(this, blk); + if (lastFlow) { + lastFlow->next = flow; + } else { + flows = flow; + } + lastFlow = flow; + fontSize = blk->lines->words->fontSize; + + // push the upper-left-most block on the stack + blk->stackNext = NULL; + blkStack = blk; + + // find the other blocks in this flow + while (blkStack) { + + // find the upper-left-most block under (but within + // maxBlockSpacing of) the top block on the stack + blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize; + blk = NULL; + i = -1; + for (j = firstBlkIdx; j < nBlocks; ++j) { + blk1 = blkArray[j]; + if (blk1) { + if (blkStack->secondaryDelta(blk1) > blkSpace) { + break; + } + if (blk && blk->secondaryDelta(blk1) > 0) { + break; + } + if (blk1->isBelow(blkStack) && + (!blk || blk1->primaryCmp(blk) < 0)) { + i = j; + blk = blk1; + } } - line0 = line1; } - } - } else { - for (flow0 = flows; flow0; flow0 = flow0->next) { - for (line0 = flow0->lines; line0; line0 = line0->flowNext) { - for (line1 = NULL, line2 = lines; - line2 && !line0->yxBefore(line2); - line1 = line2, line2 = line2->pageNext) ; - if (line1) { - line1->pageNext = line0; - } else { - lines = line0; - } - line0->pageNext = line2; + + // if a suitable block was found, add it to the flow and push it + // onto the stack + if (blk && flow->blockFits(blk, blkStack)) { + blkArray[i] = NULL; + --nBlocksLeft; + blk->next = NULL; + flow->addBlock(blk); + fontSize = blk->lines->words->fontSize; + blk->stackNext = blkStack; + blkStack = blk; + + // otherwise (if there is no block under the top block or the + // block is not suitable), pop the stack + } else { + blkStack = blkStack->stackNext; } } } + gfree(blkArray); #if 0 // for debugging - printf("*** lines in yx order ***\n"); - for (line0 = lines; line0; line0 = line0->pageNext) { - printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n", - line0->xMin, line0->xMax, line0->yMin, line0->yMax, - line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0], - line0->len); - for (word0 = line0->words; word0; word0 = word0->next) { - printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", - word0->xMin, word0->xMax, word0->yMin, word0->yMax, - word0->yBase, word0->spaceAfter); - for (i = 0; i < word0->len; ++i) { - fputc(word0->text[i] & 0xff, stdout); + printf("*** flows ***\n"); + for (flow = flows; flow; flow = flow->next) { + printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n", + flow->xMin, flow->xMax, flow->yMin, flow->yMax, + flow->priMin, flow->priMax); + for (blk = flow->blocks; blk; blk = blk->next) { + printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, + blk->priMin, blk->priMax); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } } - printf("'\n"); } } printf("\n"); - fflush(stdout); #endif -} -// If can be added the end of , return the absolute value -// of the difference between 's baseline and 's baseline, -// and set * to the horizontal space between the current last -// word in and . A smaller return value indicates a -// better fit. Otherwise, return a negative number. -double TextPage::lineFit(TextLine *line, TextWord *word, double *space) { - TextWord *lastWord; - double fontSize0, fontSize1; - double dx, dxLimit; - - lastWord = line->lastWord; - fontSize0 = line->fontSize; - fontSize1 = word->fontSize; - dx = word->xMin - lastWord->xMax; - dxLimit = fontSize0 * lastWord->font->maxSpaceWidth; - - // check inter-word spacing - if (dx < fontSize0 * lineMinDeltaX || - dx > dxLimit) { - return -1; - } - - if ( - // look for adjacent words with close baselines and close font sizes - (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 && - fontSize0 < lineMaxFontSizeRatio * fontSize1 && - fontSize1 < lineMaxFontSizeRatio * fontSize0) || - - // look for a superscript - (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 && - fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 && - (word->yMax < lastWord->yMax || - word->yBase < lastWord->yBase) && - word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 && - dx < fontSize0 * lineMaxSuperscriptDeltaX) || - - // look for a subscript - (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 && - fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 && - (word->yMin > lastWord->yMin || - word->yBase > lastWord->yBase) && - line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 && - dx < fontSize0 * lineMaxSubscriptDeltaX)) { - - *space = dx; - return fabs(word->yBase - line->yBase); - } - - return -1; -} - -// Returns true if and can be merged into a single -// line, ignoring max word spacing. -GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) { - double fontSize0, fontSize1; - double dx; - - fontSize0 = line0->fontSize; - fontSize1 = line1->fontSize; - dx = line1->xMin - line0->xMax; - - // check inter-word spacing - if (dx < fontSize0 * lineMinDeltaX) { - return gFalse; - } - - // look for close baselines and close font sizes - if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 && - fontSize0 < lineMaxFontSizeRatio * fontSize1 && - fontSize1 < lineMaxFontSizeRatio * fontSize0) { - return gTrue; + if (uMap) { + uMap->decRefCnt(); } - - return gFalse; } -// Returns true if can be added to . Assumes the y -// coordinates are within range. -GBool TextPage::blockFit(TextBlock *blk, TextLine *line) { - double fontSize0, fontSize1; - - // check edges - if (line->xMin < blk->xSpaceL || - line->xMax > blk->xSpaceR || - blk->xMin < line->xSpaceL || - blk->xMax > line->xSpaceR) { - return gFalse; - } - - // check font sizes - fontSize0 = blk->lines->fontSize; - fontSize1 = line->fontSize; - if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || - fontSize1 > blkMaxFontSizeRatio * fontSize0) { - return gFalse; - } - - return gTrue; -} +GBool TextPage::findText(Unicode *s, int len, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, + double *xMin, double *yMin, + double *xMax, double *yMax) { + TextBlock *blk; + TextLine *line; + Unicode *p; + Unicode u1, u2; + int m, i, j, k; + double xStart, yStart, xStop, yStop; + double xMin0, yMin0, xMax0, yMax0; + double xMin1, yMin1, xMax1, yMax1; + GBool found; -// Returns true if and can be merged into a single -// block. Assumes the y coordinates are within range. -GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) { - double fontSize0, fontSize1; + //~ needs to handle right-to-left text - // check edges - if (blk1->xMin < blk0->xSpaceL || - blk1->xMax > blk0->xSpaceR || - blk0->xMin < blk1->xSpaceL || - blk0->xMax > blk1->xSpaceR) { + if (rawOrder) { return gFalse; } - // check font sizes - fontSize0 = blk0->lines->fontSize; - fontSize1 = blk1->lines->fontSize; - if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || - fontSize1 > blkMaxFontSizeRatio * fontSize0) { - return gFalse; + xStart = yStart = xStop = yStop = 0; + if (startAtLast && haveLastFind) { + xStart = lastFindXMin; + yStart = lastFindYMin; + } else if (!startAtTop) { + xStart = *xMin; + yStart = *yMin; } - - return gTrue; -} - -// Returns true if can be added to . -GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) { - double dy; - - // check whitespace above and below - if (blk->yMin < flow->ySpaceT || - blk->yMax > flow->ySpaceB || - flow->yMin < blk->ySpaceT || - flow->yMax > blk->ySpaceB) { - return gFalse; + if (stopAtLast && haveLastFind) { + xStop = lastFindXMin; + yStop = lastFindYMin; + } else if (!stopAtBottom) { + xStop = *xMax; + yStop = *yMax; } - // check that block top edge is within +/- dy of flow top edge, - // and that block bottom edge is above flow bottom edge + dy - dy = flowMaxDeltaY * flow->blocks->maxFontSize; - return blk->yMin > flow->yMin - dy && - blk->yMin < flow->yMin + dy && - blk->yMax < flow->yMax + dy; -} - + found = gFalse; + xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy + xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy -GBool TextPage::findText(Unicode *s, int len, - GBool top, GBool bottom, - double *xMin, double *yMin, - double *xMax, double *yMax) { - TextLine *line; - Unicode *p; - Unicode u1, u2; - int m, i, j; - double x0, x1, x; + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; - // scan all text on the page - for (line = lines; line; line = line->pageNext) { - - // check: above top limit? - if (!top && (line->yMax < *yMin || - (line->yMin < *yMin && line->xMax <= *xMin))) { + // check: is the block above the top limit? + if (!startAtTop && blk->yMax < yStart) { continue; } - // check: below bottom limit? - if (!bottom && (line->yMin > *yMax || - (line->yMax > *yMax && line->xMin >= *xMax))) { - return gFalse; + // check: is the block below the bottom limit? + if (!stopAtBottom && blk->yMin > yStop) { + break; } - // search each position in this line - m = line->len; - for (i = 0, p = line->text; i <= m - len; ++i, ++p) { + for (line = blk->lines; line; line = line->next) { - x0 = (i == 0) ? line->xMin : line->xRight[i-1]; - x1 = line->xRight[i]; - x = 0.5 * (x0 + x1); - - // check: above top limit? - if (!top && line->yMin < *yMin) { - if (x < *xMin) { - continue; - } + // check: is the line above the top limit? + if (!startAtTop && line->yMin < yStart) { + continue; } - // check: below bottom limit? - if (!bottom && line->yMax > *yMax) { - if (x > *xMax) { - return gFalse; - } + // check: is the line below the bottom limit? + if (!stopAtBottom && line->yMin > yStop) { + continue; } - // compare the strings - for (j = 0; j < len; ++j) { + // search each position in this line + m = line->len; + for (j = 0, p = line->text; j <= m - len; ++j, ++p) { + + // compare the strings + for (k = 0; k < len; ++k) { #if 1 //~ this lowercases Latin A-Z only -- this will eventually be //~ extended to handle other character sets - if (p[j] >= 0x41 && p[j] <= 0x5a) { - u1 = p[j] + 0x20; - } else { - u1 = p[j]; - } - if (s[j] >= 0x41 && s[j] <= 0x5a) { - u2 = s[j] + 0x20; - } else { - u2 = s[j]; - } + if (p[k] >= 0x41 && p[k] <= 0x5a) { + u1 = p[k] + 0x20; + } else { + u1 = p[k]; + } + if (s[k] >= 0x41 && s[k] <= 0x5a) { + u2 = s[k] + 0x20; + } else { + u2 = s[k]; + } #endif - if (u1 != u2) { - break; + if (u1 != u2) { + break; + } } - } - // found it - if (j == len) { - *xMin = x0; - *xMax = line->xRight[i + len - 1]; - *yMin = line->yMin; - *yMax = line->yMax; - return gTrue; + // found it + if (k == len) { + switch (line->rot) { + case 0: + xMin1 = line->edge[j]; + xMax1 = line->edge[j + len]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 1: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[j]; + yMax1 = line->edge[j + len]; + break; + case 2: + xMin1 = line->edge[j + len]; + xMax1 = line->edge[j]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 3: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[j + len]; + yMax1 = line->edge[j]; + break; + } + if ((startAtTop || + yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && + (stopAtBottom || + yMin1 < yStop || (yMin1 == yStop && xMin1 < yStop))) { + if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } + } + } } } } + if (found) { + *xMin = xMin0; + *xMax = xMax0; + *yMin = yMin0; + *yMax = yMax0; + lastFindXMin = xMin0; + lastFindYMin = yMin0; + haveLastFind = gTrue; + return gTrue; + } + return gFalse; } @@ -1725,15 +2764,24 @@ GString *TextPage::getText(double xMin, double yMin, GString *s; UnicodeMap *uMap; GBool isUnicode; - char space[8], eol[16], buf[8]; - int spaceLen, eolLen, len; - TextLine *line, *prevLine; - double x0, x1, y; - int firstCol, col, i; - GBool multiLine; + TextBlock *blk; + TextLine *line; + TextLineFrag *frags; + int nFrags, fragsSize; + TextLineFrag *frag; + char space[8], eol[16]; + int spaceLen, eolLen; + int lastRot; + double x, y; + int col, idx0, idx1, i, j; + GBool multiLine, oneRot; s = new GString(); + if (rawOrder) { + return s; + } + // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; @@ -1754,109 +2802,173 @@ GString *TextPage::getText(double xMin, double yMin, break; } - // find the leftmost column - firstCol = -1; - for (line = lines; line; line = line->pageNext) { - if (line->yMin > yMax) { - break; - } - if (line->yMax < yMin || - line->xMax < xMin || - line->xMin > xMax) { - continue; - } - - y = 0.5 * (line->yMin + line->yMax); - if (y < yMin || y > yMax) { - continue; - } - - i = 0; - while (i < line->len) { - x0 = (i==0) ? line->xMin : line->xRight[i-1]; - x1 = line->xRight[i]; - if (0.5 * (x0 + x1) > xMin) { - break; + //~ writing mode (horiz/vert) + + // collect the line fragments that are in the rectangle + fragsSize = 256; + frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag)); + nFrags = 0; + lastRot = -1; + oneRot = gTrue; + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + if (xMin < blk->xMax && blk->xMin < xMax && + yMin < blk->yMax && blk->yMin < yMax) { + for (line = blk->lines; line; line = line->next) { + if (xMin < line->xMax && line->xMin < xMax && + yMin < line->yMax && line->yMin < yMax) { + idx0 = idx1 = -1; + switch (line->rot) { + case 0: + y = 0.5 * (line->yMin + line->yMax); + if (yMin < y && y < yMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) { + idx1 = j; + break; + } + --j; + } + } + break; + case 1: + x = 0.5 * (line->xMin + line->xMax); + if (xMin < x && x < xMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) { + idx1 = j; + break; + } + --j; + } + } + break; + case 2: + y = 0.5 * (line->yMin + line->yMax); + if (yMin < y && y < yMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) { + idx1 = j; + break; + } + --j; + } + } + break; + case 3: + x = 0.5 * (line->xMin + line->xMax); + if (xMin < x && x < xMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) { + idx1 = j; + break; + } + --j; + } + } + break; + } + if (idx0 >= 0 && idx1 >= 0) { + if (nFrags == fragsSize) { + fragsSize *= 2; + frags = (TextLineFrag *) + grealloc(frags, fragsSize * sizeof(TextLineFrag)); + } + frags[nFrags].init(line, idx0, idx1 - idx0 + 1); + ++nFrags; + if (lastRot >= 0 && line->rot != lastRot) { + oneRot = gFalse; + } + lastRot = line->rot; + } + } } - ++i; - } - if (i == line->len) { - continue; - } - col = line->col[i]; - - if (firstCol < 0 || col < firstCol) { - firstCol = col; } } - // extract the text - col = firstCol; - multiLine = gFalse; - prevLine = NULL; - for (line = lines; line; line = line->pageNext) { - if (line->yMin > yMax) { - break; - } - if (line->yMax < yMin || - line->xMax < xMin || - line->xMin > xMax) { - continue; - } - - y = 0.5 * (line->yMin + line->yMax); - if (y < yMin || y > yMax) { - continue; - } - - i = 0; - while (i < line->len) { - x0 = (i==0) ? line->xMin : line->xRight[i-1]; - x1 = line->xRight[i]; - if (0.5 * (x0 + x1) > xMin) { - break; - } - ++i; - } - if (i == line->len) { - continue; - } + // sort the fragments and generate the string + if (nFrags > 0) { - // insert a return - if (line->col[i] < col || - (prevLine && - line->yMin > - prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) { - s->append(eol, eolLen); - col = firstCol; - multiLine = gTrue; + for (i = 0; i < nFrags; ++i) { + frags[i].computeCoords(oneRot); } - prevLine = line; + assignColumns(frags, nFrags, oneRot); - // line this block up with the correct column - for (; col < line->col[i]; ++col) { - s->append(space, spaceLen); + // if all lines in the region have the same rotation, use it; + // otherwise, use the page's primary rotation + if (oneRot) { + qsort(frags, nFrags, sizeof(TextLineFrag), + &TextLineFrag::cmpYXLineRot); + } else { + qsort(frags, nFrags, sizeof(TextLineFrag), + &TextLineFrag::cmpYXPrimaryRot); } - // print the portion of the line - for (; i < line->len; ++i) { + col = 0; + multiLine = gFalse; + for (i = 0; i < nFrags; ++i) { + frag = &frags[i]; + + // insert a return + if (frag->col < col || + (i > 0 && fabs(frag->base - frags[i-1].base) > + maxIntraLineDelta * frags[i-1].line->words->fontSize)) { + s->append(eol, eolLen); + col = 0; + multiLine = gTrue; + } - x0 = (i==0) ? line->xMin : line->xRight[i-1]; - x1 = line->xRight[i]; - if (0.5 * (x0 + x1) > xMax) { - break; + // column alignment + for (; col < frag->col; ++col) { + s->append(space, spaceLen); } - len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); - s->append(buf, len); - col += isUnicode ? 1 : len; + // get the fragment text + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); } - } - if (multiLine) { - s->append(eol, eolLen); + if (multiLine) { + s->append(eol, eolLen); + } } + gfree(frags); uMap->decRefCnt(); return s; @@ -1865,58 +2977,107 @@ GString *TextPage::getText(double xMin, double yMin, GBool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) { + TextBlock *blk; TextLine *line; TextWord *word; - double x; + double xMin0, xMax0, yMin0, yMax0; + double xMin1, xMax1, yMin1, yMax1; GBool first; - int i; + int i, j0, j1; + + if (rawOrder) { + return gFalse; + } //~ this doesn't correctly handle: //~ - ranges split across multiple lines (the highlighted region //~ is the bounding box of all the parts of the range) //~ - cases where characters don't convert one-to-one into Unicode first = gTrue; - for (line = lines; line; line = line->pageNext) { - for (word = line->words; word; word = word->next) { - if (pos < word->charPos + word->charLen && - word->charPos < pos + length) { - i = pos - word->charPos; - if (i < 0) { - i = 0; - } - x = (i == 0) ? word->xMin : word->xRight[i - 1]; - if (first || x < *xMin) { - *xMin = x; - } - i = pos + length - word->charPos; - if (i >= word->len) { - i = word->len - 1; - } - x = word->xRight[i]; - if (first || x > *xMax) { - *xMax = x; - } - if (first || word->yMin < *yMin) { - *yMin = word->yMin; - } - if (first || word->yMax > *yMax) { - *yMax = word->yMax; + xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy + xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + if (pos < word->charPos + word->charLen && + word->charPos < pos + length) { + j0 = pos - word->charPos; + if (j0 < 0) { + j0 = 0; + } + j1 = pos + length - 1 - word->charPos; + if (j1 >= word->len) { + j1 = word->len - 1; + } + switch (line->rot) { + case 0: + xMin1 = word->edge[j0]; + xMax1 = word->edge[j1 + 1]; + yMin1 = word->yMin; + yMax1 = word->yMax; + break; + case 1: + xMin1 = word->xMin; + xMax1 = word->xMax; + yMin1 = word->edge[j0]; + yMax1 = word->edge[j1 + 1]; + break; + case 2: + xMin1 = word->edge[j1 + 1]; + xMax1 = word->edge[j0]; + yMin1 = word->yMin; + yMax1 = word->yMax; + break; + case 3: + xMin1 = word->xMin; + xMax1 = word->xMax; + yMin1 = word->edge[j1 + 1]; + yMax1 = word->edge[j0]; + break; + } + if (first || xMin1 < xMin0) { + xMin0 = xMin1; + } + if (first || xMax1 > xMax0) { + xMax0 = xMax1; + } + if (first || yMin1 < yMin0) { + yMin0 = yMin1; + } + if (first || yMax1 > yMax0) { + yMax0 = yMax1; + } + first = gFalse; } - first = gFalse; } } } - return !first; + if (!first) { + *xMin = xMin0; + *xMax = xMax0; + *yMin = yMin0; + *yMax = yMax0; + return gTrue; + } + return gFalse; } void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, GBool physLayout) { UnicodeMap *uMap; - char space[8], eol[16], eop[8], buf[8]; - int spaceLen, eolLen, eopLen, len; TextFlow *flow; + TextBlock *blk; TextLine *line; - int col, d, n, i; + TextLineFrag *frags; + TextWord *word; + int nFrags, fragsSize; + TextLineFrag *frag; + char space[8], eol[16], eop[8]; + int spaceLen, eolLen, eopLen; + GBool pageBreaks; + GString *s; + int col, i, d, n; // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { @@ -1937,69 +3098,119 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, break; } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); + pageBreaks = globalParams->getTextPageBreaks(); - // output the page, maintaining the original physical layout - if (physLayout || rawOrder) { - col = 0; - for (line = lines; line; line = line->pageNext) { + //~ writing mode (horiz/vert) - // line this block up with the correct column - if (!rawOrder) { - for (; col < line->col[0]; ++col) { + // output the page in raw (content stream) order + if (rawOrder) { + + for (word = rawWords; word; word = word->next) { + s = new GString(); + dumpFragment(word->text, word->len, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; + if (word->next && + fabs(word->next->base - word->base) < + maxIntraLineDelta * word->fontSize) { + if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) { (*outputFunc)(outputStream, space, spaceLen); } + } else { + (*outputFunc)(outputStream, eol, eolLen); } + } - // print the line - for (i = 0; i < line->len; ++i) { - len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); - (*outputFunc)(outputStream, buf, len); + // output the page, maintaining the original physical layout + } else if (physLayout) { + + // collect the line fragments for the page and sort them + fragsSize = 256; + frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag)); + nFrags = 0; + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + for (line = blk->lines; line; line = line->next) { + if (nFrags == fragsSize) { + fragsSize *= 2; + frags = (TextLineFrag *)grealloc(frags, + fragsSize * sizeof(TextLineFrag)); + } + frags[nFrags].init(line, 0, line->len); + frags[nFrags].computeCoords(gTrue); + ++nFrags; + } + } + qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot); + + // generate output + col = 0; + for (i = 0; i < nFrags; ++i) { + frag = &frags[i]; + + // column alignment + for (; col < frag->col; ++col) { + (*outputFunc)(outputStream, space, spaceLen); } - col += line->convertedLen; - // print one or more returns if necessary - if (rawOrder || - !line->pageNext || - line->pageNext->col[0] < col || - line->pageNext->yMin > - line->yMax - lineOverlapSlack * line->fontSize) { - - // compute number of returns - d = 1; - if (line->pageNext) { - d += (int)((line->pageNext->yMin - line->yMax) / - line->fontSize + 0.5); - } + // print the line + s = new GString(); + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; - // various things (weird font matrices) can result in bogus - // values here, so do a sanity check - if (d < 1) { + // print one or more returns if necessary + if (i == nFrags - 1 || + frags[i+1].col < col || + fabs(frags[i+1].base - frag->base) > + maxIntraLineDelta * frag->line->words->fontSize) { + if (i < nFrags - 1) { + d = (int)((frags[i+1].base - frag->base) / + frag->line->words->fontSize); + if (d < 1) { + d = 1; + } else if (d > 5) { + d = 5; + } + } else { d = 1; - } else if (d > 5) { - d = 5; } for (; d > 0; --d) { (*outputFunc)(outputStream, eol, eolLen); } - col = 0; } } + gfree(frags); + // output the page, "undoing" the layout } else { for (flow = flows; flow; flow = flow->next) { - for (line = flow->lines; line; line = line->flowNext) { - n = line->len; - if (line->flowNext && line->hyphenated) { - --n; - } - for (i = 0; i < n; ++i) { - len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); - (*outputFunc)(outputStream, buf, len); - } - if (line->flowNext && !line->hyphenated) { - (*outputFunc)(outputStream, space, spaceLen); + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + n = line->len; + if (line->hyphenated && (line->next || blk->next)) { + --n; + } + s = new GString(); + dumpFragment(line->text, n, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; + if (!line->hyphenated) { + if (line->next) { + (*outputFunc)(outputStream, space, spaceLen); + } else if (blk->next) { + //~ this is a bit of a kludge - we should really do a more + //~ intelligent determination of paragraphs + if (blk->next->lines->words->fontSize == + blk->lines->words->fontSize) { + (*outputFunc)(outputStream, space, spaceLen); + } else { + (*outputFunc)(outputStream, eol, eolLen); + } + } + } } } (*outputFunc)(outputStream, eol, eolLen); @@ -2008,56 +3219,196 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, } // end of page - (*outputFunc)(outputStream, eop, eopLen); - (*outputFunc)(outputStream, eol, eolLen); + if (pageBreaks) { + (*outputFunc)(outputStream, eop, eopLen); + (*outputFunc)(outputStream, eol, eolLen); + } uMap->decRefCnt(); } -void TextPage::startPage(GfxState *state) { - clear(); - if (state) { - pageWidth = state->getPageWidth(); - pageHeight = state->getPageHeight(); +void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { + TextLineFrag *frag0, *frag1; + int rot, col1, col2, i, j, k; + + // all text in the region has the same rotation -- recompute the + // column numbers based only on the text in the region + if (oneRot) { + qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot); + rot = frags[0].line->rot; + for (i = 0; i < nFrags; ++i) { + frag0 = &frags[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + frag1 = &frags[j]; + col2 = 0; // make gcc happy + switch (rot) { + case 0: + if (frag0->xMin >= frag1->xMax) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->xMin >= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 1: + if (frag0->yMin >= frag1->yMax) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->yMin >= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 2: + if (frag0->xMax <= frag1->xMin) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->xMax <= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 3: + if (frag0->yMax <= frag1->yMin) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->yMax <= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + } + if (col2 > col1) { + col1 = col2; + } + } + frag0->col = col1; + } + + // the region includes text at different rotations -- use the + // globally assigned column numbers, offset by the minimum column + // number (i.e., shift everything over to column 0) } else { - pageWidth = pageHeight = 0; + col1 = frags[0].col; + for (i = 1; i < nFrags; ++i) { + if (frags[i].col < col1) { + col1 = frags[i].col; + } + } + for (i = 0; i < nFrags; ++i) { + frags[i].col -= col1; + } } } -void TextPage::clear() { - TextWord *w1, *w2; - TextFlow *f1, *f2; +int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, + GString *s) { + char lre[8], rle[8], popdf[8], buf[8]; + int lreLen, rleLen, popdfLen, n; + int nCols, i, j, k; + + nCols = 0; + + if (uMap->isUnicode()) { + + lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); + rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); + popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); + + if (primaryLR) { + + i = 0; + while (i < len) { + // output a left-to-right section + for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (k = i; k < j; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + i = j; + // output a right-to-left section + for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ; + if (j > i) { + s->append(rle, rleLen); + for (k = j - 1; k >= i; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; + } + } + + } else { + + s->append(rle, rleLen); + i = len - 1; + while (i >= 0) { + // output a right-to-left section + for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ; + for (k = i; k > j; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + i = j; + // output a left-to-right section + for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; + if (j < i) { + s->append(lre, lreLen); + for (k = j + 1; k <= i; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; + } + } + s->append(popdf, popdfLen); - if (curWord) { - delete curWord; - curWord = NULL; - } - if (words) { - for (w1 = words; w1; w1 = w2) { - w2 = w1->next; - delete w1; } - } else if (flows) { - for (f1 = flows; f1; f1 = f2) { - f2 = f1->next; - delete f1; + + } else { + for (i = 0; i < len; ++i) { + n = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, n); + nCols += n; } } - deleteGList(fonts, TextFontInfo); - - curWord = NULL; - charPos = 0; - font = NULL; - fontSize = 0; - nest = 0; - nTinyChars = 0; - words = wordPtr = NULL; - lines = NULL; - flows = NULL; - fonts = new GList(); + return nCols; } +#if TEXTOUT_WORD_LIST +TextWordList *TextPage::makeWordList(GBool physLayout) { + return new TextWordList(this, physLayout); +} +#endif //------------------------------------------------------------------------ // TextOutputDev @@ -2127,6 +3478,7 @@ void TextOutputDev::startPage(int pageNum, GfxState *state) { } void TextOutputDev::endPage() { + text->endPage(); text->coalesce(physLayout); if (outputStream) { text->dump(outputStream, outputFunc, physLayout); @@ -2138,11 +3490,9 @@ void TextOutputDev::updateFont(GfxState *state) { } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginWord(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { - text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, @@ -2153,10 +3503,12 @@ void TextOutputDev::drawChar(GfxState *state, double x, double y, } GBool TextOutputDev::findText(Unicode *s, int len, - GBool top, GBool bottom, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax) { - return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax); + return text->findText(s, len, startAtTop, stopAtBottom, + startAtLast, stopAtLast, xMin, yMin, xMax, yMax); } GString *TextOutputDev::getText(double xMin, double yMin, @@ -2170,4 +3522,8 @@ GBool TextOutputDev::findCharRange(int pos, int length, return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); } - +#if TEXTOUT_WORD_LIST +TextWordList *TextOutputDev::makeWordList() { + return text->makeWordList(physLayout); +} +#endif