+#ifdef MACOS
+// needed for setting type/creator of MacOS files
+#include "ICSupport.h"
+#endif
+
+//------------------------------------------------------------------------
+// parameters
+//------------------------------------------------------------------------
+
+// Each bucket in a text pool includes baselines within a range of
+// this many points.
+#define textPoolStep 4
+
+// Inter-character space width which will cause addChar to start a new
+// word.
+#define minWordBreakSpace 0.1
+
+// Negative inter-character space width, i.e., overlap, which will
+// cause addChar to start a new word.
+#define minDupBreakOverlap 0.2
+
+// Max distance between baselines of two lines within a block, as a
+// fraction of the font size.
+#define maxLineSpacingDelta 1.5
+
+// Max difference in primary font sizes on two lines in the same
+// block. Delta1 is used when examining new lines above and below the
+// current block; delta2 is used when examining text that overlaps the
+// current block; delta3 is used when examining text to the left and
+// right of the current block.
+#define maxBlockFontSizeDelta1 0.05
+#define maxBlockFontSizeDelta2 0.6
+#define maxBlockFontSizeDelta3 0.2
+
+// Max difference in font sizes inside a word.
+#define maxWordFontSizeDelta 0.05
+
+// Maximum distance between baselines of two words on the same line,
+// e.g., distance between subscript or superscript and the primary
+// baseline, as a fraction of the font size.
+#define maxIntraLineDelta 0.5
+
+// Minimum inter-word spacing, as a fraction of the font size. (Only
+// used for raw ordering.)
+#define minWordSpacing 0.15
+
+// Maximum inter-word spacing, as a fraction of the font size.
+#define maxWordSpacing 1.5
+
+// Maximum horizontal spacing which will allow a word to be pulled
+// into a block.
+#define minColSpacing1 0.3
+
+// Minimum spacing between columns, as a fraction of the font size.
+#define minColSpacing2 1.0
+
+// Maximum vertical spacing between blocks within a flow, as a
+// multiple of the font size.
+#define maxBlockSpacing 2.5
+
+// Minimum spacing between characters within a word, as a fraction of
+// the font size.
+#define minCharSpacing -0.2
+
+// Maximum spacing between characters within a word, as a fraction of
+// the font size, when there is no obvious extra-wide character
+// spacing.
+#define maxCharSpacing 0.03
+
+// When extra-wide character spacing is detected, the inter-character
+// space threshold is set to the minimum inter-character space
+// multiplied by this constant.
+#define maxWideCharSpacingMul 1.3
+
+// Max difference in primary,secondary coordinates (as a fraction of
+// the font size) allowed for duplicated text (fake boldface, drop
+// shadows) which is to be discarded.
+#define dupMaxPriDelta 0.1
+#define dupMaxSecDelta 0.2
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+TextFontInfo::TextFontInfo(GfxState *state) {
+ gfxFont = state->getFont();
+#if TEXTOUT_WORD_LIST
+ fontName = (gfxFont && gfxFont->getOrigName())
+ ? gfxFont->getOrigName()->copy()
+ : (GString *)NULL;
+#endif
+}
+
+TextFontInfo::~TextFontInfo() {
+#if TEXTOUT_WORD_LIST
+ if (fontName) {
+ delete fontName;
+ }
+#endif
+}
+
+GBool TextFontInfo::matches(GfxState *state) {
+ return state->getFont() == gfxFont;
+}
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
+ int charPosA, TextFontInfo *fontA, double fontSizeA) {
+ GfxFont *gfxFont;
+ double x, y, ascent, descent;
+
+ rot = rotA;
+ charPos = charPosA;
+ charLen = 0;
+ font = fontA;
+ fontSize = fontSizeA;
+ state->transform(x0, y0, &x, &y);
+ if ((gfxFont = font->gfxFont)) {
+ ascent = gfxFont->getAscent() * fontSize;
+ descent = gfxFont->getDescent() * fontSize;
+ } else {
+ // this means that the PDF file draws text without a current font,
+ // which should never happen
+ ascent = 0.95 * fontSize;
+ descent = -0.35 * fontSize;
+ }
+ switch (rot) {
+ case 0:
+ yMin = y - ascent;
+ yMax = y - descent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 1:
+ xMin = x + descent;
+ xMax = x + ascent;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
+ case 2:
+ yMin = y + descent;
+ yMax = y + ascent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 3:
+ xMin = x - ascent;
+ xMax = x - descent;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
+ }
+ text = NULL;
+ edge = NULL;
+ len = size = 0;
+ spaceAfter = gFalse;
+ next = NULL;
+
+#if TEXTOUT_WORD_LIST
+ GfxRGB rgb;
+
+ if ((state->getRender() & 3) == 1) {
+ state->getStrokeRGB(&rgb);
+ } else {
+ state->getFillRGB(&rgb);
+ }
+ colorR = rgb.r;
+ colorG = rgb.g;
+ colorB = rgb.b;
+#endif
+}
+
+TextWord::~TextWord() {
+ gfree(text);
+ gfree(edge);
+}
+
+void TextWord::addChar(GfxState *state, double x, double y,
+ double dx, double dy, Unicode u) {
+ if (len == size) {
+ size += 16;
+ text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+ edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
+ }
+ text[len] = u;
+ switch (rot) {
+ case 0:
+ if (len == 0) {
+ xMin = x;
+ }
+ edge[len] = x;
+ xMax = edge[len+1] = x + dx;
+ break;
+ case 1:
+ if (len == 0) {
+ yMin = y;
+ }
+ edge[len] = y;
+ yMax = edge[len+1] = y + dy;
+ break;
+ case 2:
+ if (len == 0) {
+ xMax = x;
+ }
+ edge[len] = x;
+ xMin = edge[len+1] = x + dx;
+ break;
+ case 3:
+ if (len == 0) {
+ yMax = y;
+ }
+ edge[len] = y;
+ yMin = edge[len+1] = y + dy;
+ break;
+ }
+ ++len;
+}
+
+void TextWord::merge(TextWord *word) {
+ int i;
+
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
+ }
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
+ }
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ if (len + word->len > size) {
+ size = len + word->len;
+ text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+ edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
+ }
+ for (i = 0; i < word->len; ++i) {
+ text[len + i] = word->text[i];
+ edge[len + i] = word->edge[i];
+ }
+ edge[len + word->len] = word->edge[word->len];
+ len += word->len;
+ charLen += word->charLen;
+}
+
+inline int TextWord::primaryCmp(TextWord *word) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - word->xMin;
+ break;
+ case 1:
+ cmp = yMin - word->yMin;
+ break;
+ case 2:
+ cmp = word->xMax - xMax;
+ break;
+ case 3:
+ cmp = word->yMax - yMax;
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+double TextWord::primaryDelta(TextWord *word) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = word->xMin - xMax;
+ break;
+ case 1:
+ delta = word->yMin - yMax;
+ break;
+ case 2:
+ delta = xMin - word->xMax;
+ break;
+ case 3:
+ delta = yMin - word->yMax;
+ break;
+ }
+ return delta;
+}
+
+int TextWord::cmpYX(const void *p1, const void *p2) {
+ TextWord *word1 = *(TextWord **)p1;
+ TextWord *word2 = *(TextWord **)p2;
+ double cmp;
+
+ cmp = word1->yMin - word2->yMin;
+ if (cmp == 0) {
+ cmp = word1->xMin - word2->xMin;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+#if TEXTOUT_WORD_LIST
+
+GString *TextWord::getText() {
+ GString *s;
+ UnicodeMap *uMap;
+ char buf[8];
+ int n, i;
+
+ s = new GString();
+ if (!(uMap = globalParams->getTextEncoding())) {
+ return s;
+ }
+ for (i = 0; i < len; ++i) {
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
+ s->append(buf, n);
+ }
+ uMap->decRefCnt();
+ return s;
+}
+
+#endif // TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextPool
+//------------------------------------------------------------------------
+
+TextPool::TextPool() {
+ minBaseIdx = 0;
+ maxBaseIdx = -1;
+ pool = NULL;
+ cursor = NULL;
+ cursorBaseIdx = -1;
+}
+
+TextPool::~TextPool() {
+ int baseIdx;
+ TextWord *word, *word2;
+
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
+ word2 = word->next;
+ delete word;
+ }
+ }
+ gfree(pool);
+}
+
+int TextPool::getBaseIdx(double base) {
+ int baseIdx;
+
+ baseIdx = (int)(base / textPoolStep);
+ if (baseIdx < minBaseIdx) {
+ return minBaseIdx;
+ }
+ if (baseIdx > maxBaseIdx) {
+ return maxBaseIdx;
+ }
+ return baseIdx;
+}
+
+void TextPool::addWord(TextWord *word) {
+ TextWord **newPool;
+ int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
+ TextWord *w0, *w1;
+
+ // expand the array if needed
+ wordBaseIdx = (int)(word->base / textPoolStep);
+ if (minBaseIdx > maxBaseIdx) {
+ minBaseIdx = wordBaseIdx - 128;
+ maxBaseIdx = wordBaseIdx + 128;
+ pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ pool[baseIdx - minBaseIdx] = NULL;
+ }
+ } else if (wordBaseIdx < minBaseIdx) {
+ newMinBaseIdx = wordBaseIdx - 128;
+ newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
+ newPool[baseIdx - newMinBaseIdx] = NULL;
+ }
+ memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
+ (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
+ gfree(pool);
+ pool = newPool;
+ minBaseIdx = newMinBaseIdx;
+ } else if (wordBaseIdx > maxBaseIdx) {
+ newMaxBaseIdx = wordBaseIdx + 128;
+ pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
+ pool[baseIdx - minBaseIdx] = NULL;
+ }
+ maxBaseIdx = newMaxBaseIdx;
+ }
+
+ // insert the new word
+ if (cursor && wordBaseIdx == cursorBaseIdx &&
+ word->primaryCmp(cursor) > 0) {
+ w0 = cursor;
+ w1 = cursor->next;
+ } else {
+ w0 = NULL;
+ w1 = pool[wordBaseIdx - minBaseIdx];
+ }
+ for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
+ word->next = w1;
+ if (w0) {
+ w0->next = word;
+ } else {
+ pool[wordBaseIdx - minBaseIdx] = word;
+ }
+ cursor = word;
+ cursorBaseIdx = wordBaseIdx;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
+ blk = blkA;
+ rot = rotA;
+ xMin = yMin = 0;
+ xMax = yMax = -1;
+ base = baseA;
+ words = lastWord = NULL;
+ text = NULL;
+ edge = NULL;
+ col = NULL;
+ len = 0;
+ convertedLen = 0;
+ hyphenated = gFalse;
+ next = NULL;
+}
+
+TextLine::~TextLine() {
+ TextWord *word;
+
+ while (words) {
+ word = words;
+ words = words->next;
+ delete word;
+ }
+ gfree(text);
+ gfree(edge);
+ gfree(col);
+}
+
+void TextLine::addWord(TextWord *word) {
+ if (lastWord) {
+ lastWord->next = word;
+ } else {
+ words = word;
+ }
+ lastWord = word;
+
+ if (xMin > xMax) {
+ xMin = word->xMin;
+ xMax = word->xMax;
+ yMin = word->yMin;
+ yMax = word->yMax;
+ } else {
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
+ }
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ }
+}
+
+double TextLine::primaryDelta(TextLine *line) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = line->xMin - xMax;
+ break;
+ case 1:
+ delta = line->yMin - yMax;
+ break;
+ case 2:
+ delta = xMin - line->xMax;
+ break;
+ case 3:
+ delta = yMin - line->yMax;
+ break;
+ }
+ return delta;
+}
+
+int TextLine::primaryCmp(TextLine *line) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - line->xMin;
+ break;
+ case 1:
+ cmp = yMin - line->yMin;
+ break;
+ case 2:
+ cmp = line->xMax - xMax;
+ break;
+ case 3:
+ cmp = line->yMax - yMax;
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLine::secondaryCmp(TextLine *line) {
+ double cmp;
+
+ cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLine::cmpYX(TextLine *line) {
+ int cmp;
+
+ if ((cmp = secondaryCmp(line))) {
+ return cmp;
+ }
+ return primaryCmp(line);
+}
+
+int TextLine::cmpXY(const void *p1, const void *p2) {
+ TextLine *line1 = *(TextLine **)p1;
+ TextLine *line2 = *(TextLine **)p2;
+ int cmp;
+
+ if ((cmp = line1->primaryCmp(line2))) {
+ return cmp;
+ }
+ return line1->secondaryCmp(line2);
+}
+
+void TextLine::coalesce(UnicodeMap *uMap) {
+ TextWord *word0, *word1;
+ double space, delta, minSpace;
+ GBool isUnicode;
+ char buf[8];
+ int i, j;
+
+ if (words->next) {
+
+ // compute the inter-word space threshold
+ if (words->len > 1 || words->next->len > 1) {
+ minSpace = 0;
+ } else {
+ minSpace = words->primaryDelta(words->next);
+ for (word0 = words->next, word1 = word0->next;
+ word1 && minSpace > 0;
+ word0 = word1, word1 = word0->next) {
+ if (word1->len > 1) {
+ minSpace = 0;
+ }
+ delta = word0->primaryDelta(word1);
+ if (delta < minSpace) {
+ minSpace = delta;
+ }
+ }
+ }
+ if (minSpace <= 0) {
+ space = maxCharSpacing * words->fontSize;
+ } else {
+ space = maxWideCharSpacingMul * minSpace;
+ }
+
+ // merge words
+ word0 = words;
+ word1 = words->next;
+ while (word1) {
+ if (word0->primaryDelta(word1) >= space) {
+ word0->spaceAfter = gTrue;
+ word0 = word1;
+ word1 = word1->next;
+ } else if (word0->font == word1->font &&
+ fabs(word0->fontSize - word1->fontSize) <
+ maxWordFontSizeDelta * words->fontSize &&
+ word1->charPos == word0->charPos + word0->charLen) {
+ word0->merge(word1);
+ word0->next = word1->next;
+ delete word1;
+ word1 = word0->next;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
+
+ // build the line text
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
+ len = 0;
+ for (word1 = words; word1; word1 = word1->next) {
+ len += word1->len;
+ if (word1->spaceAfter) {
+ ++len;
+ }
+ }
+ text = (Unicode *)gmalloc(len * sizeof(Unicode));
+ edge = (double *)gmalloc((len + 1) * sizeof(double));
+ i = 0;
+ for (word1 = words; word1; word1 = word1->next) {
+ for (j = 0; j < word1->len; ++j) {
+ text[i] = word1->text[j];
+ edge[i] = word1->edge[j];
+ ++i;
+ }
+ edge[i] = word1->edge[word1->len];
+ if (word1->spaceAfter) {
+ text[i] = (Unicode)0x0020;
+ ++i;
+ }
+ }
+
+ // compute convertedLen and set up the col array
+ col = (int *)gmalloc((len + 1) * sizeof(int));
+ convertedLen = 0;
+ for (i = 0; i < len; ++i) {
+ col[i] = convertedLen;
+ if (isUnicode) {
+ ++convertedLen;
+ } else if (uMap) {
+ convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
+ }
+ }
+ col[len] = convertedLen;
+
+ // check for hyphen at end of line
+ //~ need to check for other chars used as hyphens
+ hyphenated = text[len - 1] == (Unicode)'-';
+}
+
+//------------------------------------------------------------------------
+// TextLineFrag
+//------------------------------------------------------------------------
+
+class TextLineFrag {
+public:
+
+ TextLine *line; // the line object
+ int start, len; // offset and length of this fragment
+ // (in Unicode chars)
+ double xMin, xMax; // bounding box coordinates
+ double yMin, yMax;
+ double base; // baseline virtual coordinate
+ int col; // first column
+
+ void init(TextLine *lineA, int startA, int lenA);
+ void computeCoords(GBool oneRot);
+
+ static int cmpYXPrimaryRot(const void *p1, const void *p2);
+ static int cmpYXLineRot(const void *p1, const void *p2);
+ static int cmpXYLineRot(const void *p1, const void *p2);
+};
+
+void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
+ line = lineA;
+ start = startA;
+ len = lenA;
+ col = line->col[start];
+}
+
+void TextLineFrag::computeCoords(GBool oneRot) {
+ TextBlock *blk;
+ double d0, d1, d2, d3, d4;
+
+ if (oneRot) {
+
+ switch (line->rot) {
+ case 0:
+ xMin = line->edge[start];
+ xMax = line->edge[start + len];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ break;
+ case 1:
+ xMin = line->xMin;
+ xMax = line->xMax;
+ yMin = line->edge[start];
+ yMax = line->edge[start + len];
+ break;
+ case 2:
+ xMin = line->edge[start + len];
+ xMax = line->edge[start];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ break;
+ case 3:
+ xMin = line->xMin;
+ xMax = line->xMax;
+ yMin = line->edge[start + len];
+ yMax = line->edge[start];
+ break;
+ }
+ base = line->base;
+
+ } else {
+
+ if (line->rot == 0 && line->blk->page->primaryRot == 0) {
+
+ xMin = line->edge[start];
+ xMax = line->edge[start + len];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ base = line->base;
+
+ } else {
+
+ blk = line->blk;
+ d0 = line->edge[start];
+ d1 = line->edge[start + len];
+ d2 = d3 = d4 = 0; // make gcc happy
+
+ switch (line->rot) {
+ case 0:
+ d2 = line->yMin;
+ d3 = line->yMax;
+ d4 = line->base;
+ d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
+ d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
+ d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
+ d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
+ d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
+ break;
+ case 1:
+ d2 = line->xMax;
+ d3 = line->xMin;
+ d4 = line->base;
+ d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
+ d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
+ d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
+ d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
+ d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
+ break;
+ case 2:
+ d2 = line->yMax;
+ d3 = line->yMin;
+ d4 = line->base;
+ d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
+ d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
+ d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
+ d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
+ d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
+ break;
+ case 3:
+ d2 = line->xMin;
+ d3 = line->xMax;
+ d4 = line->base;
+ d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
+ d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
+ d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
+ d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
+ d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
+ break;
+ }
+
+ switch (line->blk->page->primaryRot) {
+ case 0:
+ xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
+ xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
+ yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
+ yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
+ base = blk->yMin + base * (blk->yMax - blk->yMin);
+ break;
+ case 1:
+ xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
+ xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
+ yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
+ yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
+ base = blk->xMax - d4 * (blk->xMax - blk->xMin);
+ break;
+ case 2:
+ xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
+ xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
+ yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
+ yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
+ base = blk->yMax - d4 * (blk->yMax - blk->yMin);
+ break;
+ case 3:
+ xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
+ xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
+ yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
+ yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
+ base = blk->xMin + d4 * (blk->xMax - blk->xMin);
+ break;
+ }
+
+ }
+ }
+}
+
+int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (frag1->line->blk->page->primaryRot) {
+ case 0:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag1->xMin - frag2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
+ cmp = frag2->yMax - frag1->yMax;
+ }
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (frag1->line->rot) {
+ case 0:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag1->xMin - frag2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
+ cmp = frag2->yMax - frag1->yMax;
+ }
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (frag1->line->rot) {
+ case 0:
+ if ((cmp = frag1->xMin - frag2->xMin) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag2->yMin - frag1->yMin;
+ }
+ break;
+ case 3:
+ if ((cmp = frag2->yMax - frag1->yMax) == 0) {
+ cmp = frag1->xMax - frag2->xMax;
+ }
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+TextBlock::TextBlock(TextPage *pageA, int rotA) {
+ page = pageA;
+ rot = rotA;
+ xMin = yMin = 0;
+ xMax = yMax = -1;
+ priMin = 0;
+ priMax = page->pageWidth;
+ pool = new TextPool();
+ lines = NULL;
+ curLine = NULL;
+ next = NULL;
+ stackNext = NULL;
+}
+
+TextBlock::~TextBlock() {
+ TextLine *line;
+
+ delete pool;
+ while (lines) {
+ line = lines;
+ lines = lines->next;
+ delete line;
+ }
+}
+
+void TextBlock::addWord(TextWord *word) {
+ pool->addWord(word);
+ if (xMin > xMax) {
+ xMin = word->xMin;
+ xMax = word->xMax;
+ yMin = word->yMin;
+ yMax = word->yMax;
+ } else {
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
+ }
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ }
+}
+
+void TextBlock::coalesce(UnicodeMap *uMap) {
+ TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
+ TextLine *line, *line0, *line1;
+ int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
+ int baseIdx, bestWordBaseIdx, idx0, idx1;
+ double minBase, maxBase;
+ double fontSize, delta, priDelta, secDelta;
+ TextLine **lineArray;
+ GBool found;
+ int col1, col2;
+ int i, j, k;
+
+ // discard duplicated text (fake boldface, drop shadows)
+ for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
+ word0 = pool->getPool(idx0);
+ while (word0) {
+ priDelta = dupMaxPriDelta * word0->fontSize;
+ secDelta = dupMaxSecDelta * word0->fontSize;
+ if (rot == 0 || rot == 3) {
+ maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
+ } else {
+ maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
+ }
+ found = gFalse;
+ word1 = word2 = NULL; // make gcc happy
+ for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
+ if (idx1 == idx0) {
+ word1 = word0;
+ word2 = word0->next;
+ } else {
+ word1 = NULL;
+ word2 = pool->getPool(idx1);
+ }
+ for (; word2; word1 = word2, word2 = word2->next) {
+ if (word2->len == word0->len &&
+ !memcmp(word2->text, word0->text,
+ word0->len * sizeof(Unicode))) {
+ switch (rot) {
+ case 0:
+ case 2:
+ found = fabs(word0->xMin - word2->xMin) < priDelta &&
+ fabs(word0->xMax - word2->xMax) < priDelta &&
+ fabs(word0->yMin - word2->yMin) < secDelta &&
+ fabs(word0->yMax - word2->yMax) < secDelta;
+ break;
+ case 1:
+ case 3:
+ found = fabs(word0->xMin - word2->xMin) < secDelta &&
+ fabs(word0->xMax - word2->xMax) < secDelta &&
+ fabs(word0->yMin - word2->yMin) < priDelta &&
+ fabs(word0->yMax - word2->yMax) < priDelta;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (found) {
+ if (word1) {
+ word1->next = word2->next;
+ } else {
+ pool->setPool(idx1, word2->next);
+ }
+ delete word2;
+ } else {
+ word0 = word0->next;
+ }
+ }
+ }
+
+ // build the lines
+ curLine = NULL;
+ poolMinBaseIdx = pool->minBaseIdx;
+ charCount = 0;
+ nLines = 0;
+ while (1) {
+
+ // find the first non-empty line in the pool
+ for (;
+ poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
+ ++poolMinBaseIdx) ;
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
+ break;
+ }
+
+ // look for the left-most word in the first four lines of the
+ // pool -- this avoids starting with a superscript word
+ startBaseIdx = poolMinBaseIdx;
+ for (baseIdx = poolMinBaseIdx + 1;
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
+ ++baseIdx) {
+ if (!pool->getPool(baseIdx)) {
+ continue;
+ }
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
+ < 0) {
+ startBaseIdx = baseIdx;
+ }
+ }
+
+ // create a new line
+ word0 = pool->getPool(startBaseIdx);
+ pool->setPool(startBaseIdx, word0->next);
+ word0->next = NULL;
+ line = new TextLine(this, word0->rot, word0->base);
+ line->addWord(word0);
+ lastWord = word0;
+
+ // compute the search range
+ fontSize = word0->fontSize;
+ minBase = word0->base - maxIntraLineDelta * fontSize;
+ maxBase = word0->base + maxIntraLineDelta * fontSize;
+ minBaseIdx = pool->getBaseIdx(minBase);
+ maxBaseIdx = pool->getBaseIdx(maxBase);
+
+ // find the rest of the words in this line
+ while (1) {
+
+ // find the left-most word whose baseline is in the range for
+ // this line
+ bestWordBaseIdx = 0;
+ bestWord0 = bestWord1 = NULL;
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ for (word0 = NULL, word1 = pool->getPool(baseIdx);
+ word1;
+ word0 = word1, word1 = word1->next) {
+ if (word1->base >= minBase &&
+ word1->base <= maxBase &&
+ (delta = lastWord->primaryDelta(word1)) >=
+ minCharSpacing * fontSize) {
+ if (delta < maxWordSpacing * fontSize &&
+ (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
+ bestWordBaseIdx = baseIdx;
+ bestWord0 = word0;
+ bestWord1 = word1;
+ }
+ break;
+ }
+ }
+ }
+ if (!bestWord1) {
+ break;
+ }
+
+ // remove it from the pool, and add it to the line
+ if (bestWord0) {
+ bestWord0->next = bestWord1->next;
+ } else {
+ pool->setPool(bestWordBaseIdx, bestWord1->next);
+ }
+ bestWord1->next = NULL;
+ line->addWord(bestWord1);
+ lastWord = bestWord1;
+ }
+
+ // add the line
+ if (curLine && line->cmpYX(curLine) > 0) {
+ line0 = curLine;
+ line1 = curLine->next;
+ } else {
+ line0 = NULL;
+ line1 = lines;
+ }
+ for (;
+ line1 && line->cmpYX(line1) > 0;
+ line0 = line1, line1 = line1->next) ;
+ if (line0) {
+ line0->next = line;
+ } else {
+ lines = line;
+ }
+ line->next = line1;
+ curLine = line;
+ line->coalesce(uMap);
+ charCount += line->len;
+ ++nLines;
+ }
+
+ // sort lines into xy order for column assignment
+ lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *));
+ for (line = lines, i = 0; line; line = line->next, ++i) {
+ lineArray[i] = line;
+ }
+ qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
+
+ // column assignment
+ nColumns = 0;
+ for (i = 0; i < nLines; ++i) {
+ line0 = lineArray[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ line1 = lineArray[j];
+ if (line1->primaryDelta(line0) >= 0) {
+ col2 = line1->col[line1->len] + 1;
+ } else {
+ k = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 1:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 2:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 3:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ }
+ col2 = line1->col[k];
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ for (k = 0; k <= line0->len; ++k) {
+ line0->col[k] += col1;
+ }
+ if (line0->col[line0->len] > nColumns) {
+ nColumns = line0->col[line0->len];
+ }
+ }
+ gfree(lineArray);
+}
+
+void TextBlock::updatePriMinMax(TextBlock *blk) {
+ double newPriMin, newPriMax;
+ GBool gotPriMin, gotPriMax;
+
+ gotPriMin = gotPriMax = gFalse;
+ newPriMin = newPriMax = 0; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ case 2:
+ if (blk->yMin < yMax && blk->yMax > yMin) {
+ if (blk->xMin < xMin) {
+ newPriMin = blk->xMax;
+ gotPriMin = gTrue;
+ }
+ if (blk->xMax > xMax) {
+ newPriMax = blk->xMin;
+ gotPriMax = gTrue;
+ }
+ }
+ break;
+ case 1:
+ case 3:
+ if (blk->xMin < xMax && blk->xMax > xMin) {
+ if (blk->yMin < yMin) {
+ newPriMin = blk->yMax;
+ gotPriMin = gTrue;
+ }
+ if (blk->yMax > yMax) {
+ newPriMax = blk->yMin;
+ gotPriMax = gTrue;
+ }
+ }
+ break;
+ }
+ if (gotPriMin) {
+ if (newPriMin > xMin) {
+ newPriMin = xMin;
+ }
+ if (newPriMin > priMin) {
+ priMin = newPriMin;
+ }
+ }
+ if (gotPriMax) {
+ if (newPriMax < xMax) {
+ newPriMax = xMax;
+ }
+ if (newPriMax < priMax) {
+ priMax = newPriMax;
+ }
+ }
+}
+
+int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
+ TextBlock *blk1 = *(TextBlock **)p1;
+ TextBlock *blk2 = *(TextBlock **)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (blk1->page->primaryRot) {
+ case 0:
+ if ((cmp = blk1->xMin - blk2->xMin) == 0) {
+ cmp = blk1->yMin - blk2->yMin;
+ }
+ break;
+ case 1:
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
+ cmp = blk2->xMax - blk1->xMax;
+ }
+ break;
+ case 2:
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
+ cmp = blk2->yMin - blk1->yMin;
+ }
+ break;
+ case 3:
+ if ((cmp = blk2->yMax - blk1->yMax) == 0) {
+ cmp = blk1->xMax - blk2->xMax;
+ }
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
+ TextBlock *blk1 = *(TextBlock **)p1;
+ TextBlock *blk2 = *(TextBlock **)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (blk1->page->primaryRot) {
+ case 0:
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
+ cmp = blk1->xMin - blk2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
+ cmp = blk1->yMin - blk2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = blk2->yMin - blk1->yMin) == 0) {
+ cmp = blk2->xMax - blk1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = blk1->xMax - blk2->xMax) == 0) {
+ cmp = blk2->yMax - blk1->yMax;
+ }
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextBlock::primaryCmp(TextBlock *blk) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - blk->xMin;
+ break;
+ case 1:
+ cmp = yMin - blk->yMin;
+ break;
+ case 2:
+ cmp = blk->xMax - xMax;
+ break;
+ case 3:
+ cmp = blk->yMax - yMax;
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+double TextBlock::secondaryDelta(TextBlock *blk) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = blk->yMin - yMax;
+ break;
+ case 1:
+ delta = xMin - blk->xMax;
+ break;
+ case 2:
+ delta = yMin - blk->yMax;
+ break;
+ case 3:
+ delta = blk->xMin - xMax;
+ break;
+ }
+ return delta;
+}
+
+GBool TextBlock::isBelow(TextBlock *blk) {
+ GBool below;
+
+ below = gFalse; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
+ yMin > blk->yMin;
+ break;
+ case 1:
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
+ xMax < blk->xMax;
+ break;
+ case 2:
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
+ yMax < blk->yMax;
+ break;
+ case 3:
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
+ xMin > blk->xMin;
+ break;
+ }
+
+ return below;
+}
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
+ page = pageA;
+ xMin = blk->xMin;
+ xMax = blk->xMax;
+ yMin = blk->yMin;
+ yMax = blk->yMax;
+ priMin = blk->priMin;
+ priMax = blk->priMax;
+ blocks = lastBlk = blk;
+ next = NULL;
+}
+
+TextFlow::~TextFlow() {
+ TextBlock *blk;
+
+ while (blocks) {
+ blk = blocks;
+ blocks = blocks->next;
+ delete blk;
+ }
+}
+
+void TextFlow::addBlock(TextBlock *blk) {
+ if (lastBlk) {
+ lastBlk->next = blk;
+ } else {
+ blocks = blk;
+ }
+ lastBlk = blk;
+ if (blk->xMin < xMin) {
+ xMin = blk->xMin;
+ }
+ if (blk->xMax > xMax) {
+ xMax = blk->xMax;
+ }
+ if (blk->yMin < yMin) {
+ yMin = blk->yMin;
+ }
+ if (blk->yMax > yMax) {
+ yMax = blk->yMax;
+ }
+}
+
+GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
+ GBool fits;
+
+ // lower blocks must use smaller fonts
+ if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
+ return gFalse;
+ }
+
+ fits = gFalse; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
+ break;
+ case 1:
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
+ break;
+ case 2:
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
+ break;
+ case 3:
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
+ break;
+ }
+ return fits;
+}
+
+#if TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextWordList
+//------------------------------------------------------------------------
+
+TextWordList::TextWordList(TextPage *text, GBool physLayout) {
+ TextFlow *flow;
+ TextBlock *blk;
+ TextLine *line;
+ TextWord *word;
+ TextWord **wordArray;
+ int nWords, i;
+
+ words = new GList();
+
+ if (text->rawOrder) {
+ for (word = text->rawWords; word; word = word->next) {
+ words->append(word);
+ }
+
+ } else if (physLayout) {
+ // this is inefficient, but it's also the least useful of these
+ // three cases
+ nWords = 0;
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ ++nWords;
+ }
+ }
+ }
+ }
+ wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *));
+ i = 0;
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ wordArray[i++] = word;
+ }
+ }
+ }
+ }
+ qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
+ for (i = 0; i < nWords; ++i) {
+ words->append(wordArray[i]);
+ }
+ gfree(wordArray);
+
+ } else {
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ words->append(word);
+ }
+ }
+ }
+ }
+ }
+}
+
+TextWordList::~TextWordList() {
+ delete words;
+}
+
+int TextWordList::getLength() {
+ return words->getLength();
+}
+
+TextWord *TextWordList::get(int idx) {
+ if (idx < 0 || idx >= words->getLength()) {
+ return NULL;
+ }
+ return (TextWord *)words->get(idx);
+}
+
+#endif // TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextPage
+//------------------------------------------------------------------------
+
+TextPage::TextPage(GBool rawOrderA) {
+ int rot;
+
+ rawOrder = rawOrderA;
+ curWord = NULL;
+ charPos = 0;
+ curFont = NULL;
+ curFontSize = 0;
+ nest = 0;
+ nTinyChars = 0;
+ lastCharOverlap = gFalse;
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ pools[rot] = new TextPool();
+ }
+ }
+ flows = NULL;
+ blocks = NULL;
+ rawWords = NULL;
+ rawLastWord = NULL;
+ fonts = new GList();
+ lastFindXMin = lastFindYMin = 0;
+ haveLastFind = gFalse;
+}
+
+TextPage::~TextPage() {
+ int rot;
+
+ clear();
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ delete pools[rot];
+ }
+ }
+ delete fonts;
+}
+
+void TextPage::startPage(GfxState *state) {
+ clear();
+ if (state) {
+ pageWidth = state->getPageWidth();
+ pageHeight = state->getPageHeight();
+ } else {
+ pageWidth = pageHeight = 0;
+ }
+}
+
+void TextPage::endPage() {
+ if (curWord) {
+ endWord();
+ }
+}
+
+void TextPage::clear() {
+ int rot;
+ TextFlow *flow;
+ TextWord *word;
+
+ if (curWord) {
+ delete curWord;
+ curWord = NULL;
+ }
+ if (rawOrder) {
+ while (rawWords) {
+ word = rawWords;
+ rawWords = rawWords->next;
+ delete word;
+ }
+ } else {
+ for (rot = 0; rot < 4; ++rot) {
+ delete pools[rot];
+ }
+ while (flows) {
+ flow = flows;
+ flows = flows->next;
+ delete flow;
+ }
+ gfree(blocks);
+ }
+ deleteGList(fonts, TextFontInfo);
+
+ curWord = NULL;
+ charPos = 0;
+ curFont = NULL;
+ curFontSize = 0;
+ nest = 0;
+ nTinyChars = 0;
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ pools[rot] = new TextPool();
+ }
+ }
+ flows = NULL;
+ blocks = NULL;
+ rawWords = NULL;
+ rawLastWord = NULL;
+ fonts = new GList();
+}
+
+void TextPage::updateFont(GfxState *state) {
+ GfxFont *gfxFont;
+ double *fm;
+ char *name;
+ int code, mCode, letterCode, anyCode;
+ double w;
+ int i;
+
+ // get the font info object
+ curFont = NULL;
+ for (i = 0; i < fonts->getLength(); ++i) {
+ curFont = (TextFontInfo *)fonts->get(i);
+ if (curFont->matches(state)) {
+ break;
+ }
+ curFont = NULL;
+ }
+ if (!curFont) {
+ curFont = new TextFontInfo(state);
+ fonts->append(curFont);
+ }
+
+ // adjust the font size
+ gfxFont = state->getFont();
+ curFontSize = state->getTransformedFontSize();
+ if (gfxFont && gfxFont->getType() == fontType3) {
+ // This is a hack which makes it possible to deal with some Type 3
+ // fonts. The problem is that it's impossible to know what the
+ // base coordinate system used in the font is without actually
+ // rendering the font. This code tries to guess by looking at the
+ // width of the character 'm' (which breaks if the font is a
+ // subset that doesn't contain 'm').
+ mCode = letterCode = anyCode = -1;
+ for (code = 0; code < 256; ++code) {
+ name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
+ if (name && name[0] == 'm' && name[1] == '\0') {
+ mCode = code;
+ }
+ if (letterCode < 0 && name && name[1] == '\0' &&
+ ((name[0] >= 'A' && name[0] <= 'Z') ||
+ (name[0] >= 'a' && name[0] <= 'z'))) {
+ letterCode = code;
+ }
+ if (anyCode < 0 && name &&
+ ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
+ anyCode = code;
+ }
+ }
+ if (mCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
+ // 0.6 is a generic average 'm' width -- yes, this is a hack
+ curFontSize *= w / 0.6;
+ } else if (letterCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
+ // even more of a hack: 0.5 is a generic letter width
+ curFontSize *= w / 0.5;
+ } else if (anyCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
+ // better than nothing: 0.5 is a generic character width
+ curFontSize *= w / 0.5;
+ }
+ fm = gfxFont->getFontMatrix();
+ if (fm[0] != 0) {
+ curFontSize *= fabs(fm[3] / fm[0]);
+ }
+ }
+}
+
+void TextPage::beginWord(GfxState *state, double x0, double y0) {
+ double *txtm, *ctm, *fontm;
+ double m[4], m2[4];
+ int rot;
+
+ // This check is needed because Type 3 characters can contain
+ // text-drawing operations (when TextPage is being used via
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
+ if (curWord) {
+ ++nest;
+ return;
+ }
+
+ // compute the rotation
+ txtm = state->getTextMat();
+ ctm = state->getCTM();
+ m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2];
+ m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3];
+ m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2];
+ m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3];
+ if (state->getFont()->getType() == fontType3) {
+ fontm = state->getFont()->getFontMatrix();
+ m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
+ m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
+ m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
+ m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
+ m[0] = m2[0];
+ m[1] = m2[1];
+ m[2] = m2[2];
+ m[3] = m2[3];
+ }
+ if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
+ rot = (m[3] < 0) ? 0 : 2;
+ } else {
+ rot = (m[2] > 0) ? 1 : 3;
+ }
+
+ curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
+}
+
+void TextPage::addChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ CharCode c, Unicode *u, int uLen) {
+ double x1, y1, w1, h1, dx2, dy2, base, sp;
+ int i;
+
+ // if the previous char was a space, addChar will have called
+ // endWord, so we need to start a new word
+ if (!curWord) {
+ beginWord(state, x, y);
+ }
+
+ // throw away chars that aren't inside the page bounds
+ state->transform(x, y, &x1, &y1);
+ if (x1 < 0 || x1 > pageWidth ||
+ y1 < 0 || y1 > pageHeight) {
+ return;
+ }
+
+ // subtract char and word spacing from the dx,dy values
+ sp = state->getCharSpace();
+ if (c == (CharCode)0x20) {
+ sp += state->getWordSpace();
+ }
+ state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+ dx -= dx2;
+ dy -= dy2;
+ state->transformDelta(dx, dy, &w1, &h1);
+
+ // check the tiny chars limit
+ if (!globalParams->getTextKeepTinyChars() &&
+ fabs(w1) < 3 && fabs(h1) < 3) {
+ if (++nTinyChars > 50000) {
+ return;
+ }
+ }
+
+ // break words at space character
+ if (uLen == 1 && u[0] == (Unicode)0x20) {
+ ++curWord->charLen;
+ ++charPos;
+ endWord();
+ return;
+ }
+
+ // start a new word if:
+ // (1) this character's baseline doesn't match the current word's
+ // baseline, or
+ // (2) there is space between the end of the current word and this
+ // character, or
+ // (3) this character overlaps the previous one (duplicated text), or
+ // (4) the previous character was an overlap (we want each duplicated
+ // characters to be in a word by itself)
+ base = sp = 0; // make gcc happy
+ if (curWord->len > 0) {
+ switch (curWord->rot) {
+ case 0:
+ base = y1;
+ sp = x1 - curWord->xMax;
+ break;
+ case 1:
+ base = x1;
+ sp = y1 - curWord->yMax;
+ break;
+ case 2:
+ base = y1;
+ sp = curWord->xMin - x1;
+ break;
+ case 3:
+ base = x1;
+ sp = curWord->yMin - y1;
+ break;
+ }
+ if (fabs(base - curWord->base) > 0.5 ||
+ sp > minWordBreakSpace * curWord->fontSize ||
+ sp < -minDupBreakOverlap * curWord->fontSize ||
+ lastCharOverlap) {
+ lastCharOverlap = gTrue;
+ endWord();
+ beginWord(state, x, y);
+ } else {
+ lastCharOverlap = gFalse;
+ }
+ } else {
+ lastCharOverlap = gFalse;
+ }
+
+ // page rotation and/or transform matrices can cause text to be
+ // drawn in reverse order -- in this case, swap the begin/end
+ // coordinates and break text into individual chars
+ if ((curWord->rot == 0 && w1 < 0) ||
+ (curWord->rot == 1 && h1 < 0) ||
+ (curWord->rot == 2 && w1 > 0) ||
+ (curWord->rot == 3 && h1 > 0)) {
+ endWord();
+ beginWord(state, x + dx, y + dy);
+ x1 += w1;
+ y1 += h1;
+ w1 = -w1;
+ h1 = -h1;
+ }
+
+ // add the characters to the current word
+ if (uLen != 0) {
+ w1 /= uLen;
+ h1 /= uLen;
+ }
+ for (i = 0; i < uLen; ++i) {
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
+ }
+ ++curWord->charLen;
+ ++charPos;
+}
+
+void TextPage::endWord() {
+ // This check is needed because Type 3 characters can contain
+ // text-drawing operations (when TextPage is being used via
+ // {X,Win}SplashOutputDev rather than TextOutputDev).
+ if (nest > 0) {
+ --nest;
+ return;
+ }
+
+ if (curWord) {
+ addWord(curWord);
+ curWord = NULL;
+ }
+}
+
+void TextPage::addWord(TextWord *word) {
+ // throw away zero-length words -- they don't have valid xMin/xMax
+ // values, and they're useless anyway
+ if (word->len == 0) {
+ delete word;
+ return;
+ }
+
+ if (rawOrder) {
+ if (rawLastWord) {
+ rawLastWord->next = word;
+ } else {
+ rawWords = word;
+ }
+ rawLastWord = word;
+ } else {
+ pools[word->rot]->addWord(word);
+ }
+}
+
+void TextPage::coalesce(GBool physLayout) {
+ UnicodeMap *uMap;
+ TextPool *pool;
+ TextWord *word0, *word1, *word2;
+ TextLine *line;
+ TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
+ TextBlock **blkArray;
+ TextFlow *flow, *lastFlow;
+ int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
+ double minBase, maxBase, newMinBase, newMaxBase;
+ double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
+ GBool found;
+ int count[4];
+ int lrCount;
+ int firstBlkIdx, nBlocksLeft;
+ int col1, col2;
+ int i, j, n;
+
+ if (rawOrder) {
+ primaryRot = 0;
+ primaryLR = gTrue;
+ return;
+ }
+
+ uMap = globalParams->getTextEncoding();
+ blkList = NULL;
+ lastBlk = NULL;
+ nBlocks = 0;
+ primaryRot = -1;
+
+#if 0 // for debugging
+ printf("*** initial words ***\n");
+ for (rot = 0; rot < 4; ++rot) {
+ pool = pools[rot];
+ for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
+ for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->base, word0->fontSize);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ }
+ printf("\n");
+#endif
+
+ //----- assemble the blocks
+
+ //~ add an outer loop for writing mode (vertical text)
+
+ // build blocks for each rotation value
+ for (rot = 0; rot < 4; ++rot) {
+ pool = pools[rot];
+ poolMinBaseIdx = pool->minBaseIdx;
+ count[rot] = 0;
+
+ // add blocks until no more words are left
+ while (1) {
+
+ // find the first non-empty line in the pool
+ for (;
+ poolMinBaseIdx <= pool->maxBaseIdx &&
+ !pool->getPool(poolMinBaseIdx);
+ ++poolMinBaseIdx) ;
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
+ break;
+ }
+
+ // look for the left-most word in the first four lines of the
+ // pool -- this avoids starting with a superscript word
+ startBaseIdx = poolMinBaseIdx;
+ for (baseIdx = poolMinBaseIdx + 1;
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
+ ++baseIdx) {
+ if (!pool->getPool(baseIdx)) {
+ continue;
+ }
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
+ < 0) {
+ startBaseIdx = baseIdx;
+ }
+ }
+
+ // create a new block
+ word0 = pool->getPool(startBaseIdx);
+ pool->setPool(startBaseIdx, word0->next);
+ word0->next = NULL;
+ blk = new TextBlock(this, rot);
+ blk->addWord(word0);
+
+ fontSize = word0->fontSize;
+ minBase = maxBase = word0->base;
+ colSpace1 = minColSpacing1 * fontSize;
+ colSpace2 = minColSpacing2 * fontSize;
+ lineSpace = maxLineSpacingDelta * fontSize;
+ intraLineSpace = maxIntraLineDelta * fontSize;
+
+ // add words to the block
+ do {
+ found = gFalse;
+
+ // look for words on the line above the current top edge of
+ // the block
+ newMinBase = minBase;
+ for (baseIdx = pool->getBaseIdx(minBase);
+ baseIdx >= pool->getBaseIdx(minBase - lineSpace);
+ --baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base < minBase &&
+ word1->base >= minBase - lineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta1 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ newMinBase = word2->base;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
+ minBase = newMinBase;
+
+ // look for words on the line below the current bottom edge of
+ // the block
+ newMaxBase = maxBase;
+ for (baseIdx = pool->getBaseIdx(maxBase);
+ baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base > maxBase &&
+ word1->base <= maxBase + lineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta1 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ newMaxBase = word2->base;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
+ maxBase = newMaxBase;
+
+ // look for words that are on lines already in the block, and
+ // that overlap the block horizontally
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax + colSpace1 &&
+ word1->xMax > blk->xMin - colSpace1)
+ : (word1->yMin < blk->yMax + colSpace1 &&
+ word1->yMax > blk->yMin - colSpace1)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta2 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
+
+ // only check for outlying words (the next two chunks of code)
+ // if we didn't find anything else
+ if (found) {
+ continue;
+ }