+#ifdef MACOS
+// needed for setting type/creator of MacOS files
+#include "ICSupport.h"
+#endif
+
+//------------------------------------------------------------------------
+// parameters
+//------------------------------------------------------------------------
+
+// Minimum and maximum inter-word spacing (as a fraction of the average
+// character width).
+#define wordMinSpaceWidth 0.3
+#define wordMaxSpaceWidth 2.0
+
+// Default min and max inter-word spacing (when the average character
+// width is unknown).
+#define wordDefMinSpaceWidth 0.2
+#define wordDefMaxSpaceWidth 1.5
+
+// Max difference in x,y coordinates (as a fraction of the font size)
+// allowed for duplicated text (fake boldface, drop shadows) which is
+// to be discarded.
+#define dupMaxDeltaX 0.1
+#define dupMaxDeltaY 0.2
+
+// Min overlap (as a fraction of the font size) required for two
+// lines to be considered vertically overlapping.
+#define lineOverlapSlack 0.5
+
+// Max difference in baseline y coordinates (as a fraction of the font
+// size) allowed for words which are to be grouped into a line, not
+// including sub/superscripts.
+#define lineMaxBaselineDelta 0.1
+
+// Max ratio of font sizes allowed for words which are to be grouped
+// into a line, not including sub/superscripts.
+#define lineMaxFontSizeRatio 1.4
+
+// Min spacing (as a fraction of the font size) allowed between words
+// which are to be grouped into a line.
+#define lineMinDeltaX -0.5
+
+// Minimum vertical overlap (as a fraction of the font size) required
+// for superscript and subscript words.
+#define lineMinSuperscriptOverlap 0.3
+#define lineMinSubscriptOverlap 0.3
+
+// Min/max ratio of font sizes allowed for sub/superscripts compared to
+// the base text.
+#define lineMinSubscriptFontSizeRatio 0.4
+#define lineMaxSubscriptFontSizeRatio 1.01
+#define lineMinSuperscriptFontSizeRatio 0.4
+#define lineMaxSuperscriptFontSizeRatio 1.01
+
+// Max horizontal spacing (as a fraction of the font size) allowed
+// before sub/superscripts.
+#define lineMaxSubscriptDeltaX 0.2
+#define lineMaxSuperscriptDeltaX 0.2
+
+// Maximum vertical spacing (as a fraction of the font size) allowed
+// between lines which are to be grouped into a block.
+#define blkMaxSpacing 2.0
+
+// Max ratio of primary font sizes allowed for lines which are to be
+// grouped into a block.
+#define blkMaxFontSizeRatio 1.3
+
+// Min overlap (as a fraction of the font size) required for two
+// blocks to be considered vertically overlapping.
+#define blkOverlapSlack 0.5
+
+// Max vertical spacing (as a fraction of the font size) allowed
+// between blocks which are 'adjacent' when sorted by reading order.
+#define blkMaxSortSpacing 2.0
+
+// Max vertical offset (as a fraction of the font size) of the top and
+// bottom edges allowed for blocks which are to be grouped into a
+// flow.
+#define flowMaxDeltaY 1.0
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+// Record the font and effective horizontal scaling taken from <state>,
+// and derive the min/max inter-word spacing thresholds for that font.
+TextFontInfo::TextFontInfo(GfxState *state) {
+  double *tm;
+  double xScale, yScale, widthSum, meanWidth, cw;
+  int nWidths, code;
+
+  gfxFont = state->getFont();
+  tm = state->getTextMat();
+  horizScaling = state->getHorizScaling();
+
+  // fold the x/y scale ratio of the text matrix into the horizontal
+  // scaling, unless either diagonal entry is (nearly) zero
+  xScale = fabs(tm[0]);
+  if (xScale > 0.01) {
+    yScale = fabs(tm[3]);
+    if (yScale > 0.01) {
+      horizScaling *= xScale / yScale;
+    }
+  }
+
+  // defaults, kept whenever no average character width is computed
+  minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+  maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+
+  if (!gfxFont) {
+    return;
+  }
+  if (gfxFont->isCIDFont()) {
+    //~ handle 16-bit fonts
+    return;
+  }
+  if (gfxFont->getType() == fontType3) {
+    return;
+  }
+
+  // 8-bit, non-Type 3 font: average the positive widths of all 256
+  // character codes
+  widthSum = 0;
+  nWidths = 0;
+  for (code = 0; code < 256; ++code) {
+    cw = ((Gfx8BitFont *)gfxFont)->getWidth(code);
+    if (cw > 0) {
+      widthSum += cw;
+      ++nWidths;
+    }
+  }
+  if (nWidths > 0) {
+    meanWidth = widthSum / nWidths;
+    minSpaceWidth = horizScaling * wordMinSpaceWidth * meanWidth;
+    maxSpaceWidth = horizScaling * wordMaxSpaceWidth * meanWidth;
+  }
+}
+
+// Nothing is released here; in particular <gfxFont> is not deleted.
+TextFontInfo::~TextFontInfo() {}
+
+// Returns true if <state> has the same font object and (to within
+// 0.01) the same effective horizontal scaling as this font info.
+GBool TextFontInfo::matches(GfxState *state) {
+  double *tm;
+  double xScale, yScale, scaling;
+
+  // recompute the effective scaling the same way the constructor does
+  tm = state->getTextMat();
+  scaling = state->getHorizScaling();
+  xScale = fabs(tm[0]);
+  if (xScale > 0.01) {
+    yScale = fabs(tm[3]);
+    if (yScale > 0.01) {
+      scaling *= xScale / yScale;
+    }
+  }
+
+  if (state->getFont() != gfxFont) {
+    return gFalse;
+  }
+  return fabs(scaling - horizScaling) < 0.01;
+}
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+// Start an empty word whose baseline is at user-space point (x0, y0),
+// transformed through <state>.  The vertical extent is derived from
+// the font's ascent/descent scaled by the font size.
+TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
+                   TextFontInfo *fontA, double fontSizeA) {
+  GfxFont *f;
+  double xDev, yDev;
+
+  // empty text buffers, no successor yet
+  text = NULL;
+  xRight = NULL;
+  len = size = 0;
+  spaceAfter = gFalse;
+  next = NULL;
+
+  charPos = charPosA;
+  charLen = 0;
+  font = fontA;
+  fontSize = fontSizeA;
+
+  state->transform(x0, y0, &xDev, &yDev);
+  yBase = yDev;
+  f = font->gfxFont;
+  if (f) {
+    yMin = yDev - f->getAscent() * fontSize;
+    yMax = yDev - f->getDescent() * fontSize;
+  } else {
+    // this means that the PDF file draws text without a current font,
+    // which should never happen
+    yMin = yDev - 0.95 * fontSize;
+    yMax = yDev + 0.35 * fontSize;
+  }
+  if (yMin == yMax) {
+    // sanity check for a case that shouldn't happen: force a non-zero
+    // height so that later code never divides by zero
+    yMin = yDev;
+    yMax = yDev + 1;
+  }
+}
+
+
+// Release the per-character arrays grown by addChar() / merge().
+TextWord::~TextWord() {
+  gfree(xRight);
+  gfree(text);
+}
+
+// Append character <u>, spanning x .. x+dx, to this word.  The arrays
+// grow in steps of 16 entries.  <state>, <y> and <dy> are not used.
+void TextWord::addChar(GfxState *state, double x, double y,
+                       double dx, double dy, Unicode u) {
+  double rightEdge;
+
+  if (len == size) {
+    size += 16;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  // the first character fixes the left edge of the word
+  if (len == 0) {
+    xMin = x;
+  }
+  rightEdge = x + dx;
+  text[len] = u;
+  xRight[len] = rightEdge;
+  xMax = rightEdge;
+  ++len;
+}
+
+// Returns true if <this> sorts before <word2> in xy order: compare
+// xMin first, and fall back to yMin only when the xMin values are
+// equal.
+GBool TextWord::xyBefore(TextWord *word2) {
+  if (xMin != word2->xMin) {
+    return xMin < word2->xMin;
+  }
+  return yMin < word2->yMin;
+}
+
+// Append the characters of <word2> to this word.  The right edge is
+// taken from <word2>, the vertical extent becomes the union of both
+// words, and charLen is the sum of both.
+void TextWord::merge(TextWord *word2) {
+  int total, k;
+
+  total = len + word2->len;
+
+  // bounding box
+  xMax = word2->xMax;
+  if (yMin > word2->yMin) {
+    yMin = word2->yMin;
+  }
+  if (yMax < word2->yMax) {
+    yMax = word2->yMax;
+  }
+
+  // make room, then copy the characters and their right edges
+  if (total > size) {
+    size = total;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  for (k = 0; k < word2->len; ++k) {
+    text[len + k] = word2->text[k];
+    xRight[len + k] = word2->xRight[k];
+  }
+
+  len = total;
+  charLen += word2->charLen;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+// Create an empty line: no words, no text arrays, not linked into any
+// list.  The geometry fields are not set here.
+TextLine::TextLine() {
+  // list links
+  next = NULL;
+  pageNext = NULL;
+  flowNext = NULL;
+
+  // content
+  words = NULL;
+  len = 0;
+  text = NULL;
+  xRight = NULL;
+  col = NULL;
+  hyphenated = gFalse;
+}
+
+// Delete every word on this line's word list, then free the text,
+// right-edge and column arrays.
+TextLine::~TextLine() {
+  TextWord *cur, *following;
+
+  cur = words;
+  while (cur) {
+    // fetch the successor before the node goes away
+    following = cur->next;
+    delete cur;
+    cur = following;
+  }
+  gfree(text);
+  gfree(xRight);
+  gfree(col);
+}
+
+// Returns true if <this> comes before <line2> in yx order.  Lines that
+// overlap vertically (after shrinking by lineOverlapSlack * fontSize)
+// are ordered by xMin alone; otherwise yMin decides, with xMin as the
+// tie-breaker.
+GBool TextLine::yxBefore(TextLine *line2) {
+  double slack;
+  GBool disjoint;
+
+  slack = lineOverlapSlack * fontSize;
+  disjoint = line2->yMin > yMax - slack ||
+             line2->yMax < yMin + slack;
+
+  // overlapping case
+  if (!disjoint) {
+    return xMin < line2->xMin;
+  }
+
+  // non-overlapping case
+  if (yMin != line2->yMin) {
+    return yMin < line2->yMin;
+  }
+  return xMin < line2->xMin;
+}
+
+// Append the words and text of <line2> to the end of this line,
+// separated by a single space character.  <line2> gives up ownership
+// of its word list (its <words> pointer is set to NULL).
+// NOTE(review): <text> and <xRight> are resized here but <col> is
+// not -- confirm nothing indexes col[] past the old length after a
+// merge.
+void TextLine::merge(TextLine *line2) {
+  int newLen, base, k;
+
+  // bounding box and right-hand whitespace
+  xMax = line2->xMax;
+  if (yMin > line2->yMin) {
+    yMin = line2->yMin;
+  }
+  if (yMax < line2->yMax) {
+    yMax = line2->yMax;
+  }
+  xSpaceR = line2->xSpaceR;
+
+  // splice the word lists together
+  lastWord->spaceAfter = gTrue;
+  lastWord->next = line2->words;
+  lastWord = line2->lastWord;
+  line2->words = NULL;
+
+  // grow the character arrays: old text + one space + line2's text
+  base = len + 1;
+  newLen = base + line2->len;
+  text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
+  xRight = (double *)grealloc(xRight, newLen * sizeof(double));
+  text[len] = (Unicode)0x0020;
+  xRight[len] = line2->xMin;
+  for (k = 0; k < line2->len; ++k) {
+    text[base + k] = line2->text[k];
+    xRight[base + k] = line2->xRight[k];
+  }
+  len = newLen;
+
+  convertedLen += line2->convertedLen;
+  hyphenated = line2->hyphenated;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+// Create an empty, unlinked block.
+TextBlock::TextBlock() {
+  next = NULL;
+  lines = NULL;
+}
+
+// Delete every line reachable from <lines> through the <next> links.
+TextBlock::~TextBlock() {
+  TextLine *cur, *following;
+
+  cur = lines;
+  while (cur) {
+    following = cur->next;
+    delete cur;
+    cur = following;
+  }
+}
+
+// Returns true if <this> comes before <blk2> in yx order.  Blocks that
+// overlap vertically (after shrinking by blkOverlapSlack times the
+// font size of this block's first line) are ordered by xMin alone;
+// otherwise yMin decides, with xMin as the tie-breaker.
+GBool TextBlock::yxBefore(TextBlock *blk2) {
+  double slack;
+  GBool disjoint;
+
+  slack = blkOverlapSlack * lines->fontSize;
+  disjoint = blk2->yMin > yMax - slack ||
+             blk2->yMax < yMin + slack;
+
+  // overlapping case
+  if (!disjoint) {
+    return xMin < blk2->xMin;
+  }
+
+  // non-overlapping case
+  if (yMin != blk2->yMin) {
+    return yMin < blk2->yMin;
+  }
+  return xMin < blk2->xMin;
+}
+
+// Merge the first line of <blk2> onto the right end of this block's
+// first line, then refresh this block's right edge, vertical extent
+// and right-hand whitespace from the merged line.
+void TextBlock::mergeRight(TextBlock *blk2) {
+  TextLine *merged;
+
+  merged = lines;
+  merged->merge(blk2->lines);
+  xSpaceR = merged->xSpaceR;
+  yMax = merged->yMax;
+  yMin = merged->yMin;
+  xMax = merged->xMax;
+}
+
+// Append the lines of <blk2> below the last line of this block.  The
+// horizontal extent becomes the union of both blocks, the bottom edge
+// is taken from <blk2>, and the surrounding whitespace is narrowed to
+// the tighter of the two.  <blk2> gives up ownership of its lines.
+void TextBlock::mergeBelow(TextBlock *blk2) {
+  TextLine *tail;
+
+  // bounding box
+  if (xMin > blk2->xMin) {
+    xMin = blk2->xMin;
+  }
+  if (xMax < blk2->xMax) {
+    xMax = blk2->xMax;
+  }
+  yMax = blk2->yMax;
+
+  // whitespace on either side: keep the narrower gap
+  if (xSpaceL < blk2->xSpaceL) {
+    xSpaceL = blk2->xSpaceL;
+  }
+  if (xSpaceR > blk2->xSpaceR) {
+    xSpaceR = blk2->xSpaceR;
+  }
+
+  if (maxFontSize < blk2->maxFontSize) {
+    maxFontSize = blk2->maxFontSize;
+  }
+
+  // walk to the last line and hook blk2's lines on, in both the block
+  // list and the flow list
+  tail = lines;
+  while (tail->next) {
+    tail = tail->next;
+  }
+  tail->flowNext = blk2->lines;
+  tail->next = blk2->lines;
+  blk2->lines = NULL;
+}
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+// Create an empty, unlinked flow.
+TextFlow::TextFlow() {
+  next = NULL;
+  blocks = NULL;
+}
+
+// Delete every block reachable from <blocks> through the <next> links.
+TextFlow::~TextFlow() {
+  TextBlock *cur, *following;
+
+  cur = blocks;
+  while (cur) {
+    following = cur->next;
+    delete cur;
+    cur = following;
+  }
+}
+
+
+//------------------------------------------------------------------------
+// TextPage
+//------------------------------------------------------------------------
+
+// Create an empty page.  <rawOrderA> is stored as the rawOrder flag;
+// all lists start out empty and a fresh font info list is allocated.
+TextPage::TextPage(GBool rawOrderA) {
+  rawOrder = rawOrderA;
+
+  // state of the word currently being assembled
+  curWord = NULL;
+  nest = 0;
+  charPos = 0;
+  nTinyChars = 0;
+  font = NULL;
+  fontSize = 0;
+
+  // collected content
+  words = NULL;
+  wordPtr = NULL;
+  lines = NULL;
+  flows = NULL;
+  fonts = new GList();
+}
+
+TextPage::~TextPage() {
+ clear(); // runs before the font list object itself is deleted
+ delete fonts; // deletes the GList object held in <fonts>
+}
+
+// Select (creating and registering it if necessary) the TextFontInfo
+// that matches <state>, and recompute the working font size from the
+// state's transformed font size.  For Type 3 fonts the size is further
+// corrected by a heuristic based on glyph widths and the font matrix.
+void TextPage::updateFont(GfxState *state) {
+  GfxFont *gfxFont;
+  double *fm;
+  char *name;
+  int code, mCode, letterCode, anyCode;
+  double w;
+  int i;
+
+  // get the font info object
+  font = NULL;
+  for (i = 0; i < fonts->getLength(); ++i) {
+    font = (TextFontInfo *)fonts->get(i);
+    if (font->matches(state)) {
+      break;
+    }
+    font = NULL;
+  }
+  if (!font) {
+    font = new TextFontInfo(state);
+    fonts->append(font);
+  }
+
+  // adjust the font size
+  gfxFont = state->getFont();
+  fontSize = state->getTransformedFontSize();
+  if (gfxFont && gfxFont->getType() == fontType3) {
+    // This is a hack which makes it possible to deal with some Type 3
+    // fonts.  The problem is that it's impossible to know what the
+    // base coordinate system used in the font is without actually
+    // rendering the font.  This code tries to guess by looking at the
+    // width of the character 'm' (which breaks if the font is a
+    // subset that doesn't contain 'm').
+    mCode = letterCode = anyCode = -1;
+    for (code = 0; code < 256; ++code) {
+      name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
+      // the last code named "m" wins
+      if (name && name[0] == 'm' && name[1] == '\0') {
+        mCode = code;
+      }
+      // first code whose name is a single ASCII letter; name[0] is
+      // tested before name[1] so that an empty name ("") is never read
+      // past its terminator
+      if (letterCode < 0 && name &&
+          ((name[0] >= 'A' && name[0] <= 'Z') ||
+           (name[0] >= 'a' && name[0] <= 'z')) &&
+          name[1] == '\0') {
+        letterCode = code;
+      }
+      // first named code with a positive width
+      if (anyCode < 0 && name &&
+          ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
+        anyCode = code;
+      }
+    }
+    if (mCode >= 0 &&
+        (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
+      // 0.6 is a generic average 'm' width -- yes, this is a hack
+      fontSize *= w / 0.6;
+    } else if (letterCode >= 0 &&
+               (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
+      // even more of a hack: 0.5 is a generic letter width
+      fontSize *= w / 0.5;
+    } else if (anyCode >= 0 &&
+               (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
+      // better than nothing: 0.5 is a generic character width
+      fontSize *= w / 0.5;
+    }
+    // scale by the font matrix's y/x ratio, skipping a zero x scale
+    fm = gfxFont->getFontMatrix();
+    if (fm[0] != 0) {
+      fontSize *= fabs(fm[3] / fm[0]);
+    }
+  }
+}
+
+// Start a new word at (x0, y0) using the current font and font size.
+// If a word is already open, only the nesting counter is bumped; this
+// is needed because Type 3 characters can contain text-drawing
+// operations (when TextPage is being used via XOutputDev rather than
+// TextOutputDev).
+void TextPage::beginWord(GfxState *state, double x0, double y0) {
+  if (!curWord) {
+    curWord = new TextWord(state, x0, y0, charPos, font, fontSize);
+    return;
+  }
+  ++nest;
+}
+
+// Add one character (code <c>, mapped to the <uLen> Unicode values in
+// <u>) drawn at (x, y) with advance (dx, dy) to the current word,
+// starting and ending words as needed.
+void TextPage::addChar(GfxState *state, double x, double y,
+                       double dx, double dy,
+                       CharCode c, Unicode *u, int uLen) {
+  double xDev, yDev, wDev, hDev, spDx, spDy, spacing;
+  int nChars, k;
+
+  // if the previous char was a space, addChar will have called
+  // endWord, so we need to start a new word
+  if (!curWord) {
+    beginWord(state, x, y);
+  }
+
+  // throw away chars that aren't inside the page bounds
+  state->transform(x, y, &xDev, &yDev);
+  if (xDev < 0 || xDev > pageWidth ||
+      yDev < 0 || yDev > pageHeight) {
+    return;
+  }
+
+  // subtract char and word spacing from the dx,dy values
+  spacing = state->getCharSpace();
+  if (c == (CharCode)0x20) {
+    spacing += state->getWordSpace();
+  }
+  state->textTransformDelta(spacing * state->getHorizScaling(), 0,
+                            &spDx, &spDy);
+  dx -= spDx;
+  dy -= spDy;
+  state->transformDelta(dx, dy, &wDev, &hDev);
+
+  // check the tiny chars limit: the counter is only advanced for
+  // tiny chars, and only when they are not being kept
+  if (!globalParams->getTextKeepTinyChars() &&
+      fabs(wDev) < 3 && fabs(hDev) < 3 &&
+      ++nTinyChars > 20000) {
+    return;
+  }
+
+  // break words at space character
+  if (uLen == 1 && u[0] == (Unicode)0x20) {
+    ++curWord->charLen;
+    ++charPos;
+    endWord();
+    return;
+  }
+
+  // large char spacing is sometimes used to move text around -- in
+  // this case, break text into individual chars and let the coalesce
+  // function deal with it later
+  nChars = curWord->len;
+  if (nChars > 0 &&
+      xDev - curWord->xRight[nChars - 1] >
+        curWord->font->minSpaceWidth * curWord->fontSize) {
+    endWord();
+    beginWord(state, x, y);
+  }
+
+  // page rotation and/or transform matrices can cause text to be
+  // drawn in reverse order -- in this case, swap the begin/end
+  // coordinates and break text into individual chars
+  if (wDev < 0) {
+    endWord();
+    beginWord(state, x + dx, y + dy);
+    xDev += wDev;
+    yDev += hDev;
+    wDev = -wDev;
+    hDev = -hDev;
+  }
+
+  // add the characters to the current word, splitting the advance
+  // evenly among the Unicode values
+  if (uLen != 0) {
+    wDev /= uLen;
+    hDev /= uLen;
+  }
+  for (k = 0; k < uLen; ++k) {
+    curWord->addChar(state, xDev + k*wDev, yDev + k*hDev,
+                     wDev, hDev, u[k]);
+  }
+  ++curWord->charLen;
+  ++charPos;
+}
+
+// Finish the current word and hand it to addWord().  While nested
+// (see beginWord) only the nesting counter is decremented; this is
+// needed because Type 3 characters can contain text-drawing
+// operations (when TextPage is being used via XOutputDev rather than
+// TextOutputDev).
+void TextPage::endWord() {
+  if (nest > 0) {
+    --nest;
+  } else if (curWord) {
+    addWord(curWord);
+    curWord = NULL;
+  }
+}
+
+// Insert <word> into the page's word list.  In raw order the word is
+// linked directly after the most recently added word; otherwise it is
+// placed at its position in xy order, starting the search from the
+// most recently added word when that word sorts before the new one.
+void TextPage::addWord(TextWord *word) {
+  TextWord *prev, *succ;
+
+  // throw away zero-length words -- they don't have valid xMin/xMax
+  // values, and they're useless anyway
+  if (word->len == 0) {
+    delete word;
+    return;
+  }
+
+  // find the insertion point: <prev> is the node before, <succ> the
+  // node after
+  if (rawOrder) {
+    prev = wordPtr;
+    succ = NULL;
+  } else {
+    if (wordPtr && wordPtr->xyBefore(word)) {
+      prev = wordPtr;
+      succ = wordPtr->next;
+    } else {
+      prev = NULL;
+      succ = words;
+    }
+    while (succ && !word->xyBefore(succ)) {
+      prev = succ;
+      succ = succ->next;
+    }
+  }
+
+  // link the word in and remember it for the next insertion
+  word->next = succ;
+  if (prev) {
+    prev->next = word;
+  } else {
+    words = word;
+  }
+  wordPtr = word;
+}
+
+void TextPage::coalesce(GBool physLayout) {
+ TextWord *word0, *word1, *word2;
+ TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
+ TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
+ TextBlock *yxBlocks, *blocks, *blkStack;
+ TextFlow *flow0, *flow1;
+ double sz, xLimit, yLimit;
+ double fit1, fit2, sp1, sp2;
+ GBool found;
+ UnicodeMap *uMap;
+ GBool isUnicode;
+ char buf[8];
+ int col1, col2, d, i, j;
+
+#if 0 // for debugging
+ printf("*** initial word list ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- discard duplicated text (fake boldface, drop shadows)
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+ xLimit = word0->xMin + sz * dupMaxDeltaX;
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->len == word0->len &&
+ !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
+ fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
+ fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
+ fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
+ found = gTrue;
+ break;
+ }
+ }
+ if (found) {
+ word1->next = word2->next;
+ delete word2;
+ } else {
+ word0 = word0->next;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** words after removing duplicate text ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- merge words
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+
+ // look for adjacent text which is part of the same word, and
+ // merge it into this word
+ xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
+ if (rawOrder) {
+ word1 = word0;
+ word2 = word0->next;
+ found = word2 &&
+ word2->xMin < xLimit &&
+ word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05 &&
+ word2->charPos == word0->charPos + word0->charLen;
+ } else {
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05 &&
+ word2->charPos == word0->charPos + word0->charLen) {
+ found = gTrue;
+ break;
+ }
+ }
+ }
+ if (found) {
+ word0->merge(word2);
+ word1->next = word2->next;
+ delete word2;
+ continue;
+ }
+
+ word0 = word0->next;
+ }
+
+#if 0 // for debugging
+ printf("*** after merging words ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- assemble words into lines
+
+ lineList = line0 = NULL;
+ while (words) {
+
+ // remove the first word from the word list
+ word0 = words;
+ words = words->next;
+ word0->next = NULL;
+
+ // find the best line (if any) for the word
+ if (rawOrder) {
+ if (line0 && lineFit(line0, word0, &sp2) >= 0) {
+ line1 = line0;
+ sp1 = sp2;
+ } else {
+ line1 = NULL;
+ sp1 = 0;
+ }
+ } else {
+ line1 = NULL;
+ fit1 = 0;
+ sp1 = 0;
+ for (line2 = lineList; line2; line2 = line2->next) {
+ fit2 = lineFit(line2, word0, &sp2);
+ if (fit2 >= 0 && (!line1 || fit2 < fit1)) {
+ line1 = line2;
+ fit1 = fit2;
+ sp1 = sp2;
+ }
+ }
+ }
+
+ // found a line: append the word
+ if (line1) {
+ word1 = line1->lastWord;
+ word1->next = word0;
+ line1->lastWord = word0;
+ if (word0->xMax > line1->xMax) {
+ line1->xMax = word0->xMax;
+ }
+ if (word0->yMin < line1->yMin) {
+ line1->yMin = word0->yMin;
+ }
+ if (word0->yMax > line1->yMax) {
+ line1->yMax = word0->yMax;
+ }
+ line1->len += word0->len;
+ if (sp1 > line1->fontSize * line1->font->minSpaceWidth) {
+ word1->spaceAfter = gTrue;
+ ++line1->len;
+ }
+
+ // didn't find a line: create a new line
+ } else {
+ line1 = new TextLine();
+ line1->words = line1->lastWord = word0;
+ line1->xMin = word0->xMin;
+ line1->xMax = word0->xMax;
+ line1->yMin = word0->yMin;
+ line1->yMax = word0->yMax;
+ line1->yBase = word0->yBase;
+ line1->font = word0->font;
+ line1->fontSize = word0->fontSize;
+ line1->len = word0->len;
+ if (line0) {
+ line0->next = line1;
+ } else {
+ lineList = line1;
+ }
+ line0 = line1;
+ }
+ }
+
+ // build the line text
+ uMap = globalParams->getTextEncoding();
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
+
+ for (line1 = lineList; line1; line1 = line1->next) {
+ line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
+ line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
+ line1->col = (int *)gmalloc(line1->len * sizeof(int));
+ i = 0;
+ for (word1 = line1->words; word1; word1 = word1->next) {
+ for (j = 0; j < word1->len; ++j) {
+ line1->text[i] = word1->text[j];
+ line1->xRight[i] = word1->xRight[j];
+ ++i;
+ }
+ if (word1->spaceAfter && word1->next) {
+ line1->text[i] = (Unicode)0x0020;
+ line1->xRight[i] = word1->next->xMin;
+ ++i;
+ }
+ }
+ line1->convertedLen = 0;
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] = line1->convertedLen;
+ if (isUnicode) {
+ ++line1->convertedLen;
+ } else if (uMap) {
+ line1->convertedLen +=
+ uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
+ }
+ }
+
+ // check for hyphen at end of line
+ //~ need to check for other chars used as hyphens
+ if (line1->text[line1->len - 1] == (Unicode)'-') {
+ line1->hyphenated = gTrue;
+ }
+
+ }
+
+ if (uMap) {
+ uMap->decRefCnt();
+ }
+
+#if 0 // for debugging
+ printf("*** lines in xy order ***\n");
+ for (line0 = lineList; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- column assignment
+
+ for (line1 = lineList; line1; line1 = line1->next) {
+ col1 = 0;
+ for (line2 = lineList; line2 != line1; line2 = line2->next) {
+ if (line1->xMin >= line2->xMax) {
+ d = (int)((line1->xMin - line2->xMax) /
+ (line1->font->maxSpaceWidth * line1->fontSize));
+ if (d > 4) {
+ d = 4;
+ }
+ col2 = line2->col[0] + line2->convertedLen + d;
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ } else if (line1->xMin > line2->xMin) {
+ for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
+ col2 = line2->col[i];
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ }
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] += col1;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** lines in xy order, after column assignment ***\n");
+ for (line0 = lineList; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->col[0], line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- assemble lines into blocks
+
+ if (rawOrder) {
+
+ lines = lineList;
+ for (line1 = lines; line1; line1 = line1->next) {
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ }
+
+ } else {
+
+ // sort lines into yx order
+ lines = NULL;
+ while (lineList) {
+ line0 = lineList;
+ lineList = lineList->next;
+ for (line1 = NULL, line2 = lines;
+ line2 && !line0->yxBefore(line2);
+ line1 = line2, line2 = line2->next) ;
+ if (line1) {
+ line1->next = line0;
+ } else {
+ lines = line0;
+ }
+ line0->next = line2;
+ }
+
+ // compute whitespace to left and right of each line
+ line0 = lines;
+ for (line1 = lines; line1; line1 = line1->next) {
+
+ // find the first vertically overlapping line
+ for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
+
+ // check each vertically overlapping line -- look for the nearest
+ // on each side
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ for (line2 = line0;
+ line2 && line2->yMin < line1->yMax;
+ line2 = line2->next) {
+ if (line2->yMax > line1->yMin) {
+ if (line2->xMax < line1->xMin) {
+ if (line2->xMax > line1->xSpaceL) {
+ line1->xSpaceL = line2->xMax;
+ }
+ } else if (line2->xMin > line1->xMax) {
+ if (line2->xMin < line1->xSpaceR) {
+ line1->xSpaceR = line2->xMin;
+ }
+ }
+ }
+ }
+ }
+ } // (!rawOrder)
+
+#if 0 // for debugging
+ printf("*** lines in yx order ***\n");
+ for (line0 = lines; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ lineList = lines;
+ yxBlocks = NULL;
+ blk0 = NULL;
+ while (lineList) {
+
+ // build a new block object
+ line0 = lineList;
+ lineList = lineList->next;
+ line0->next = NULL;
+ blk1 = new TextBlock();
+ blk1->lines = line0;
+ blk1->xMin = line0->xMin;
+ blk1->xMax = line0->xMax;
+ blk1->yMin = line0->yMin;
+ blk1->yMax = line0->yMax;
+ blk1->xSpaceL = line0->xSpaceL;
+ blk1->xSpaceR = line0->xSpaceR;
+ blk1->maxFontSize = line0->fontSize;
+
+ // find subsequent lines in the block
+ while (lineList) {
+
+ // look for the first horizontally overlapping line below this
+ // one
+ yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
+ line3 = line4 = NULL;
+ if (rawOrder) {
+ if (lineList->yMin < yLimit &&
+ lineList->xMax > blk1->xMin &&
+ lineList->xMin < blk1->xMax) {
+ line3 = NULL;
+ line4 = lineList;
+ }
+ } else {
+ for (line1 = NULL, line2 = lineList;
+ line2 && line2->yMin < yLimit;
+ line1 = line2, line2 = line2->next) {
+ if (line2->xMax > blk1->xMin &&
+ line2->xMin < blk1->xMax) {
+ line3 = line1;
+ line4 = line2;
+ break;
+ }
+ }
+ }
+
+ // if there is an overlapping line and it fits in the block, add
+ // it to the block
+ if (line4 && blockFit(blk1, line4)) {
+ if (line3) {
+ line3->next = line4->next;
+ } else {
+ lineList = line4->next;
+ }
+ line0->next = line0->flowNext = line4;
+ line4->next = NULL;
+ if (line4->xMin < blk1->xMin) {
+ blk1->xMin = line4->xMin;
+ } else if (line4->xMax > blk1->xMax) {
+ blk1->xMax = line4->xMax;
+ }
+ if (line4->yMax > blk1->yMax) {
+ blk1->yMax = line4->yMax;
+ }
+ if (line4->xSpaceL > blk1->xSpaceL) {
+ blk1->xSpaceL = line4->xSpaceL;
+ }
+ if (line4->xSpaceR < blk1->xSpaceR) {
+ blk1->xSpaceR = line4->xSpaceR;
+ }
+ if (line4->fontSize > blk1->maxFontSize) {
+ blk1->maxFontSize = line4->fontSize;
+ }
+ line0 = line4;
+
+ // otherwise, we're done with this block
+ } else {
+ break;
+ }
+ }
+
+ // insert block on list, in yx order
+ if (rawOrder) {
+ blk2 = blk0;
+ blk3 = NULL;
+ blk0 = blk1;
+ } else {
+ for (blk2 = NULL, blk3 = yxBlocks;
+ blk3 && !blk1->yxBefore(blk3);
+ blk2 = blk3, blk3 = blk3->next) ;
+ }
+ blk1->next = blk3;
+ if (blk2) {
+ blk2->next = blk1;
+ } else {
+ yxBlocks = blk1;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** blocks in yx order ***\n");
+ for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- merge lines and blocks, sort blocks into reading order