X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.cc;h=3fcc9ec294ecdbd1e7800c98764c0928e04ca6c8;hb=884f739665dc56e66f51e104350f2affd33f2dd8;hp=5e5761fe04c4076e571ae2e2e32cbeb1662603f4;hpb=2a393c134fe3fe8eb85bf818cb7ad6ae4396322a;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 5e5761fe..3fcc9ec2 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -2,22 +2,28 @@ // // TextOutputDev.cc // -// Copyright 1997-2002 Glyph & Cog, LLC +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== -#ifdef __GNUC__ +#include + +#ifdef USE_GCC_PRAGMAS #pragma implementation #endif -#include #include #include #include #include #include -#include "GString.h" +#ifdef WIN32 +#include // for O_BINARY +#include // for setmode +#endif #include "gmem.h" +#include "GString.h" +#include "GList.h" #include "config.h" #include "Error.h" #include "GlobalParams.h" @@ -31,17 +37,154 @@ #endif //------------------------------------------------------------------------ -// TextString +// parameters +//------------------------------------------------------------------------ + +// Minium and maximum inter-word spacing (as a fraction of the average +// character width). +#define wordMinSpaceWidth 0.3 +#define wordMaxSpaceWidth 2.0 + +// Default min and max inter-word spacing (when the average character +// width is unknown). +#define wordDefMinSpaceWidth 0.2 +#define wordDefMaxSpaceWidth 1.5 + +// Max difference in x,y coordinates (as a fraction of the font size) +// allowed for duplicated text (fake boldface, drop shadows) which is +// to be discarded. +#define dupMaxDeltaX 0.1 +#define dupMaxDeltaY 0.2 + +// Min overlap (as a fraction of the font size) required for two +// lines to be considered vertically overlapping. +#define lineOverlapSlack 0.5 + +// Max difference in baseline y coordinates (as a fraction of the font +// size) allowed for words which are to be grouped into a line, not +// including sub/superscripts. +#define lineMaxBaselineDelta 0.1 + +// Max ratio of font sizes allowed for words which are to be grouped +// into a line, not including sub/superscripts. +#define lineMaxFontSizeRatio 1.4 + +// Min spacing (as a fraction of the font size) allowed between words +// which are to be grouped into a line. +#define lineMinDeltaX -0.5 + +// Minimum vertical overlap (as a fraction of the font size) required +// for superscript and subscript words. +#define lineMinSuperscriptOverlap 0.3 +#define lineMinSubscriptOverlap 0.3 + +// Min/max ratio of font sizes allowed for sub/superscripts compared to +// the base text. +#define lineMinSubscriptFontSizeRatio 0.4 +#define lineMaxSubscriptFontSizeRatio 1.01 +#define lineMinSuperscriptFontSizeRatio 0.4 +#define lineMaxSuperscriptFontSizeRatio 1.01 + +// Max horizontal spacing (as a fraction of the font size) allowed +// before sub/superscripts. +#define lineMaxSubscriptDeltaX 0.2 +#define lineMaxSuperscriptDeltaX 0.2 + +// Maximum vertical spacing (as a fraction of the font size) allowed +// for lines which are to be grouped into a block. +#define blkMaxSpacing 2.0 + +// Max ratio of primary font sizes allowed for lines which are to be +// grouped into a block. +#define blkMaxFontSizeRatio 1.3 + +// Min overlap (as a fraction of the font size) required for two +// blocks to be considered vertically overlapping. +#define blkOverlapSlack 0.5 + +// Max vertical spacing (as a fraction of the font size) allowed +// between blocks which are 'adjacent' when sorted by reading order. +#define blkMaxSortSpacing 2.0 + +// Max vertical offset (as a fraction of the font size) of the top and +// bottom edges allowed for blocks which are to be grouped into a +// flow. +#define flowMaxDeltaY 1.0 + +//------------------------------------------------------------------------ +// TextFontInfo +//------------------------------------------------------------------------ + +TextFontInfo::TextFontInfo(GfxState *state) { + double *textMat; + double t1, t2, avgWidth, w; + int n, i; + + gfxFont = state->getFont(); + textMat = state->getTextMat(); + horizScaling = state->getHorizScaling(); + if ((t1 = fabs(textMat[0])) > 0.01 && + (t2 = fabs(textMat[3])) > 0.01) { + horizScaling *= t1 / t2; + } + + minSpaceWidth = horizScaling * wordDefMinSpaceWidth; + maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth; + if (gfxFont && gfxFont->isCIDFont()) { + //~ handle 16-bit fonts + } else if (gfxFont && gfxFont->getType() != fontType3) { + avgWidth = 0; + n = 0; + for (i = 0; i < 256; ++i) { + w = ((Gfx8BitFont *)gfxFont)->getWidth(i); + if (w > 0) { + avgWidth += w; + ++n; + } + } + if (n > 0) { + avgWidth /= n; + minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth; + maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth; + } + } + +} + +TextFontInfo::~TextFontInfo() { +} + +GBool TextFontInfo::matches(GfxState *state) { + double *textMat; + double t1, t2, h; + + textMat = state->getTextMat(); + h = state->getHorizScaling(); + if ((t1 = fabs(textMat[0])) > 0.01 && + (t2 = fabs(textMat[3])) > 0.01) { + h *= t1 / t2; + } + return state->getFont() == gfxFont && + fabs(h - horizScaling) < 0.01; +} + +//------------------------------------------------------------------------ +// TextWord //------------------------------------------------------------------------ -TextString::TextString(GfxState *state, double fontSize) { - GfxFont *font; +TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA, + TextFontInfo *fontA, double fontSizeA) { + GfxFont *gfxFont; double x, y; - state->transform(state->getCurX(), state->getCurY(), &x, &y); - if ((font = state->getFont())) { - yMin = y - font->getAscent() * fontSize; - yMax = y - font->getDescent() * fontSize; + charPos = charPosA; + charLen = 0; + font = fontA; + fontSize = fontSizeA; + state->transform(x0, y0, &x, &y); + if ((gfxFont = font->gfxFont)) { + yMin = y - gfxFont->getAscent() * fontSize; + yMax = y - gfxFont->getDescent() * fontSize; } else { // this means that the PDF file draws text without a current font, // which should never happen @@ -54,21 +197,23 @@ TextString::TextString(GfxState *state, double fontSize) { yMin = y; yMax = y + 1; } - col = 0; + yBase = y; text = NULL; xRight = NULL; len = size = 0; - yxNext = NULL; - xyNext = NULL; + spaceAfter = gFalse; + next = NULL; + } -TextString::~TextString() { + +TextWord::~TextWord() { gfree(text); gfree(xRight); } -void TextString::addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode u) { +void TextWord::addChar(GfxState *state, double x, double y, + double dx, double dy, Unicode u) { if (len == size) { size += 16; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); @@ -82,249 +227,1460 @@ void TextString::addChar(GfxState *state, double x, double y, ++len; } +// Returns true if comes before in xy order. +GBool TextWord::xyBefore(TextWord *word2) { + return xMin < word2->xMin || + (xMin == word2->xMin && yMin < word2->yMin); +} + +// Merge another word onto the end of this one. +void TextWord::merge(TextWord *word2) { + int i; + + xMax = word2->xMax; + if (word2->yMin < yMin) { + yMin = word2->yMin; + } + if (word2->yMax > yMax) { + yMax = word2->yMax; + } + if (len + word2->len > size) { + size = len + word2->len; + text = (Unicode *)grealloc(text, size * sizeof(Unicode)); + xRight = (double *)grealloc(xRight, size * sizeof(double)); + } + for (i = 0; i < word2->len; ++i) { + text[len + i] = word2->text[i]; + xRight[len + i] = word2->xRight[i]; + } + len += word2->len; + charLen += word2->charLen; +} + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +TextLine::TextLine() { + words = NULL; + text = NULL; + xRight = NULL; + col = NULL; + len = 0; + hyphenated = gFalse; + pageNext = NULL; + next = NULL; + flowNext = NULL; +} + +TextLine::~TextLine() { + TextWord *w1, *w2; + + for (w1 = words; w1; w1 = w2) { + w2 = w1->next; + delete w1; + } + gfree(text); + gfree(xRight); + gfree(col); +} + +// Returns true if comes before in yx order, allowing +// slack for vertically overlapping lines. +GBool TextLine::yxBefore(TextLine *line2) { + double dy; + + dy = lineOverlapSlack * fontSize; + + // non-overlapping case + if (line2->yMin > yMax - dy || + line2->yMax < yMin + dy) { + return yMin < line2->yMin || + (yMin == line2->yMin && xMin < line2->xMin); + } + + // overlapping case + return xMin < line2->xMin; +} + +// Merge another line's words onto the end of this line. +void TextLine::merge(TextLine *line2) { + int newLen, i; + + xMax = line2->xMax; + if (line2->yMin < yMin) { + yMin = line2->yMin; + } + if (line2->yMax > yMax) { + yMax = line2->yMax; + } + xSpaceR = line2->xSpaceR; + lastWord->spaceAfter = gTrue; + lastWord->next = line2->words; + lastWord = line2->lastWord; + line2->words = NULL; + newLen = len + 1 + line2->len; + text = (Unicode *)grealloc(text, newLen * sizeof(Unicode)); + xRight = (double *)grealloc(xRight, newLen * sizeof(double)); + text[len] = (Unicode)0x0020; + xRight[len] = line2->xMin; + for (i = 0; i < line2->len; ++i) { + text[len + 1 + i] = line2->text[i]; + xRight[len + 1 + i] = line2->xRight[i]; + } + len = newLen; + convertedLen += line2->convertedLen; + hyphenated = line2->hyphenated; +} + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +TextBlock::TextBlock() { + lines = NULL; + next = NULL; +} + +TextBlock::~TextBlock() { + TextLine *l1, *l2; + + for (l1 = lines; l1; l1 = l2) { + l2 = l1->next; + delete l1; + } +} + +// Returns true if comes before in xy order, allowing +// slack for vertically overlapping blocks. +GBool TextBlock::yxBefore(TextBlock *blk2) { + double dy; + + dy = blkOverlapSlack * lines->fontSize; + + // non-overlapping case + if (blk2->yMin > yMax - dy || + blk2->yMax < yMin + dy) { + return yMin < blk2->yMin || + (yMin == blk2->yMin && xMin < blk2->xMin); + } + + // overlapping case + return xMin < blk2->xMin; +} + +// Merge another block's line onto the right of this one. +void TextBlock::mergeRight(TextBlock *blk2) { + lines->merge(blk2->lines); + xMax = lines->xMax; + yMin = lines->yMin; + yMax = lines->yMax; + xSpaceR = lines->xSpaceR; +} + +// Merge another block's lines onto the bottom of this block. +void TextBlock::mergeBelow(TextBlock *blk2) { + TextLine *line; + + if (blk2->xMin < xMin) { + xMin = blk2->xMin; + } + if (blk2->xMax > xMax) { + xMax = blk2->xMax; + } + yMax = blk2->yMax; + if (blk2->xSpaceL > xSpaceL) { + xSpaceL = blk2->xSpaceL; + } + if (blk2->xSpaceR < xSpaceR) { + xSpaceR = blk2->xSpaceR; + } + if (blk2->maxFontSize > maxFontSize) { + maxFontSize = blk2->maxFontSize; + } + for (line = lines; line->next; line = line->next) ; + line->next = line->flowNext = blk2->lines; + blk2->lines = NULL; +} + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +TextFlow::TextFlow() { + blocks = NULL; + next = NULL; +} + +TextFlow::~TextFlow() { + TextBlock *b1, *b2; + + for (b1 = blocks; b1; b1 = b2) { + b2 = b1->next; + delete b1; + } +} + + //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool rawOrderA) { rawOrder = rawOrderA; - curStr = NULL; + curWord = NULL; + charPos = 0; + font = NULL; fontSize = 0; - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; nest = 0; + nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); } TextPage::~TextPage() { clear(); + delete fonts; } void TextPage::updateFont(GfxState *state) { - GfxFont *font; + GfxFont *gfxFont; double *fm; char *name; - int code; + int code, mCode, letterCode, anyCode; double w; + int i; + + // get the font info object + font = NULL; + for (i = 0; i < fonts->getLength(); ++i) { + font = (TextFontInfo *)fonts->get(i); + if (font->matches(state)) { + break; + } + font = NULL; + } + if (!font) { + font = new TextFontInfo(state); + fonts->append(font); + } // adjust the font size + gfxFont = state->getFont(); fontSize = state->getTransformedFontSize(); - if ((font = state->getFont()) && font->getType() == fontType3) { + if (gfxFont && gfxFont->getType() == fontType3) { // This is a hack which makes it possible to deal with some Type 3 // fonts. The problem is that it's impossible to know what the // base coordinate system used in the font is without actually // rendering the font. This code tries to guess by looking at the // width of the character 'm' (which breaks if the font is a // subset that doesn't contain 'm'). + mCode = letterCode = anyCode = -1; for (code = 0; code < 256; ++code) { - if ((name = ((Gfx8BitFont *)font)->getCharName(code)) && - name[0] == 'm' && name[1] == '\0') { - break; + name = ((Gfx8BitFont *)gfxFont)->getCharName(code); + if (name && name[0] == 'm' && name[1] == '\0') { + mCode = code; } - } - if (code < 256) { - w = ((Gfx8BitFont *)font)->getWidth(code); - if (w != 0) { - // 600 is a generic average 'm' width -- yes, this is a hack - fontSize *= w / 0.6; + if (letterCode < 0 && name && name[1] == '\0' && + ((name[0] >= 'A' && name[0] <= 'Z') || + (name[0] >= 'a' && name[0] <= 'z'))) { + letterCode = code; + } + if (anyCode < 0 && name && + ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { + anyCode = code; } } - fm = font->getFontMatrix(); + if (mCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { + // 0.6 is a generic average 'm' width -- yes, this is a hack + fontSize *= w / 0.6; + } else if (letterCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { + // even more of a hack: 0.5 is a generic letter width + fontSize *= w / 0.5; + } else if (anyCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { + // better than nothing: 0.5 is a generic character width + fontSize *= w / 0.5; + } + fm = gfxFont->getFontMatrix(); if (fm[0] != 0) { fontSize *= fabs(fm[3] / fm[0]); } } } -void TextPage::beginString(GfxState *state) { +void TextPage::beginWord(GfxState *state, double x0, double y0) { // This check is needed because Type 3 characters can contain - // text-drawing operations. - if (curStr) { + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). + if (curWord) { ++nest; return; } - curStr = new TextString(state, fontSize); + curWord = new TextWord(state, x0, y0, charPos, font, fontSize); } void TextPage::addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode *u, int uLen) { - double x1, y1, w1, h1, dx2, dy2; + double dx, double dy, + CharCode c, Unicode *u, int uLen) { + double x1, y1, w1, h1, dx2, dy2, sp; int n, i; + // if the previous char was a space, addChar will have called + // endWord, so we need to start a new word + if (!curWord) { + beginWord(state, x, y); + } + + // throw away chars that aren't inside the page bounds state->transform(x, y, &x1, &y1); - n = curStr->len; - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - endString(); - beginString(state); - } - state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(), - 0, &dx2, &dy2); + if (x1 < 0 || x1 > pageWidth || + y1 < 0 || y1 > pageHeight) { + return; + } + + // subtract char and word spacing from the dx,dy values + sp = state->getCharSpace(); + if (c == (CharCode)0x20) { + sp += state->getWordSpace(); + } + state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); dx -= dx2; dy -= dy2; state->transformDelta(dx, dy, &w1, &h1); + + // check the tiny chars limit + if (!globalParams->getTextKeepTinyChars() && + fabs(w1) < 3 && fabs(h1) < 3) { + if (++nTinyChars > 20000) { + return; + } + } + + // break words at space character + if (uLen == 1 && u[0] == (Unicode)0x20) { + ++curWord->charLen; + ++charPos; + endWord(); + return; + } + + // large char spacing is sometimes used to move text around -- in + // this case, break text into individual chars and let the coalesce + // function deal with it later + n = curWord->len; + if (n > 0 && x1 - curWord->xRight[n-1] > + curWord->font->minSpaceWidth * curWord->fontSize) { + endWord(); + beginWord(state, x, y); + } + + // page rotation and/or transform matrices can cause text to be + // drawn in reverse order -- in this case, swap the begin/end + // coordinates and break text into individual chars + if (w1 < 0) { + endWord(); + beginWord(state, x + dx, y + dy); + x1 += w1; + y1 += h1; + w1 = -w1; + h1 = -h1; + } + + // add the characters to the current word if (uLen != 0) { w1 /= uLen; h1 /= uLen; } for (i = 0; i < uLen; ++i) { - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } + ++curWord->charLen; + ++charPos; } -void TextPage::endString() { - TextString *p1, *p2; - double h, y1, y2; - +void TextPage::endWord() { // This check is needed because Type 3 characters can contain - // text-drawing operations. + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). if (nest > 0) { --nest; return; } - // throw away zero-length strings -- they don't have valid xMin/xMax + if (curWord) { + addWord(curWord); + curWord = NULL; + } +} + +void TextPage::addWord(TextWord *word) { + TextWord *p1, *p2; + + // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway - if (curStr->len == 0) { - delete curStr; - curStr = NULL; + if (word->len == 0) { + delete word; return; } - // insert string in y-major list - h = curStr->yMax - curStr->yMin; - y1 = curStr->yMin + 0.5 * h; - y2 = curStr->yMin + 0.8 * h; + // insert word in xy list if (rawOrder) { - p1 = yxCur1; + p1 = wordPtr; p2 = NULL; - } else if ((!yxCur1 || - (y1 >= yxCur1->yMin && - (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) && - (!yxCur2 || - (y1 < yxCur2->yMin || - (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) { - p1 = yxCur1; - p2 = yxCur2; } else { - for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) { - if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) { + if (wordPtr && wordPtr->xyBefore(word)) { + p1 = wordPtr; + p2 = wordPtr->next; + } else { + p1 = NULL; + p2 = words; + } + for (; p2; p1 = p2, p2 = p2->next) { + if (word->xyBefore(p2)) { break; } } - yxCur2 = p2; } - yxCur1 = curStr; if (p1) { - p1->yxNext = curStr; + p1->next = word; } else { - yxStrings = curStr; + words = word; } - curStr->yxNext = p2; - curStr = NULL; + word->next = p2; + wordPtr = word; } -void TextPage::coalesce() { - TextString *str1, *str2; - double space, d; - GBool addSpace; - int n, i; +void TextPage::coalesce(GBool physLayout) { + TextWord *word0, *word1, *word2; + TextLine *line0, *line1, *line2, *line3, *line4, *lineList; + TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6; + TextBlock *yxBlocks, *blocks, *blkStack; + TextFlow *flow0, *flow1; + double sz, xLimit, yLimit; + double fit1, fit2, sp1, sp2; + GBool found; + UnicodeMap *uMap; + GBool isUnicode; + char buf[8]; + int col1, col2, d, i, j; + +#if 0 // for debugging + printf("*** initial word list ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif -#if 0 //~ for debugging - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - printf("x=%3d..%3d y=%3d..%3d size=%2d '", - (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax, - (int)(str1->yMax - str1->yMin)); - for (i = 0; i < str1->len; ++i) { - fputc(str1->text[i] & 0xff, stdout); + //----- discard duplicated text (fake boldface, drop shadows) + + word0 = words; + while (word0) { + sz = word0->fontSize; + xLimit = word0->xMin + sz * dupMaxDeltaX; + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->len == word0->len && + !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) && + fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY && + fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY && + fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) { + found = gTrue; + break; + } + } + if (found) { + word1->next = word2->next; + delete word2; + } else { + word0 = word0->next; + } + } + +#if 0 // for debugging + printf("*** words after removing duplicate text ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } - printf("\n------------------------------------------------------------\n\n"); + printf("\n"); + fflush(stdout); #endif - str1 = yxStrings; - while (str1 && (str2 = str1->yxNext)) { - space = str1->yMax - str1->yMin; - d = str2->xMin - str1->xMax; - if (((rawOrder && - ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || - (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || - (!rawOrder && str2->yMin < str1->yMax)) && - d > -0.5 * space && d < space) { - n = str1->len + str2->len; - if ((addSpace = d > 0.1 * space)) { - ++n; + + //----- merge words + + word0 = words; + while (word0) { + sz = word0->fontSize; + + // look for adjacent text which is part of the same word, and + // merge it into this word + xLimit = word0->xMax + sz * word0->font->minSpaceWidth; + if (rawOrder) { + word1 = word0; + word2 = word0->next; + found = word2 && + word2->xMin < xLimit && + word2->font == word0->font && + fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05 && + word2->charPos == word0->charPos + word0->charLen; + } else { + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->font == word0->font && + fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05 && + word2->charPos == word0->charPos + word0->charLen) { + found = gTrue; + break; + } + } + } + if (found) { + word0->merge(word2); + word1->next = word2->next; + delete word2; + continue; + } + + word0 = word0->next; + } + +#if 0 // for debugging + printf("*** after merging words ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif + + //----- assemble words into lines + + lineList = line0 = NULL; + while (words) { + + // remove the first word from the word list + word0 = words; + words = words->next; + word0->next = NULL; + + // find the best line (if any) for the word + if (rawOrder) { + if (line0 && lineFit(line0, word0, &sp2) >= 0) { + line1 = line0; + sp1 = sp2; + } else { + line1 = NULL; + sp1 = 0; + } + } else { + line1 = NULL; + fit1 = 0; + sp1 = 0; + for (line2 = lineList; line2; line2 = line2->next) { + fit2 = lineFit(line2, word0, &sp2); + if (fit2 >= 0 && (!line1 || fit2 < fit1)) { + line1 = line2; + fit1 = fit2; + sp1 = sp2; + } + } + } + + // found a line: append the word + if (line1) { + word1 = line1->lastWord; + word1->next = word0; + line1->lastWord = word0; + if (word0->xMax > line1->xMax) { + line1->xMax = word0->xMax; + } + if (word0->yMin < line1->yMin) { + line1->yMin = word0->yMin; + } + if (word0->yMax > line1->yMax) { + line1->yMax = word0->yMax; + } + line1->len += word0->len; + if (sp1 > line1->fontSize * line1->font->minSpaceWidth) { + word1->spaceAfter = gTrue; + ++line1->len; + } + + // didn't find a line: create a new line + } else { + line1 = new TextLine(); + line1->words = line1->lastWord = word0; + line1->xMin = word0->xMin; + line1->xMax = word0->xMax; + line1->yMin = word0->yMin; + line1->yMax = word0->yMax; + line1->yBase = word0->yBase; + line1->font = word0->font; + line1->fontSize = word0->fontSize; + line1->len = word0->len; + if (line0) { + line0->next = line1; + } else { + lineList = line1; + } + line0 = line1; + } + } + + // build the line text + uMap = globalParams->getTextEncoding(); + isUnicode = uMap ? uMap->isUnicode() : gFalse; + + for (line1 = lineList; line1; line1 = line1->next) { + line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode)); + line1->xRight = (double *)gmalloc(line1->len * sizeof(double)); + line1->col = (int *)gmalloc(line1->len * sizeof(int)); + i = 0; + for (word1 = line1->words; word1; word1 = word1->next) { + for (j = 0; j < word1->len; ++j) { + line1->text[i] = word1->text[j]; + line1->xRight[i] = word1->xRight[j]; + ++i; + } + if (word1->spaceAfter && word1->next) { + line1->text[i] = (Unicode)0x0020; + line1->xRight[i] = word1->next->xMin; + ++i; + } + } + line1->convertedLen = 0; + for (j = 0; j < line1->len; ++j) { + line1->col[j] = line1->convertedLen; + if (isUnicode) { + ++line1->convertedLen; + } else if (uMap) { + line1->convertedLen += + uMap->mapUnicode(line1->text[j], buf, sizeof(buf)); + } + } + + // check for hyphen at end of line + //~ need to check for other chars used as hyphens + if (line1->text[line1->len - 1] == (Unicode)'-') { + line1->hyphenated = gTrue; + } + + } + + if (uMap) { + uMap->decRefCnt(); + } + +#if 0 // for debugging + printf("*** lines in xy order ***\n"); + for (line0 = lineList; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- column assignment + + for (line1 = lineList; line1; line1 = line1->next) { + col1 = 0; + for (line2 = lineList; line2 != line1; line2 = line2->next) { + if (line1->xMin >= line2->xMax) { + d = (int)((line1->xMin - line2->xMax) / + (line1->font->maxSpaceWidth * line1->fontSize)); + if (d > 4) { + d = 4; + } + col2 = line2->col[0] + line2->convertedLen + d; + if (col2 > col1) { + col1 = col2; + } + } else if (line1->xMin > line2->xMin) { + for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ; + col2 = line2->col[i]; + if (col2 > col1) { + col1 = col2; + } + } + } + for (j = 0; j < line1->len; ++j) { + line1->col[j] += col1; + } + } + +#if 0 // for debugging + printf("*** lines in xy order, after column assignment ***\n"); + for (line0 = lineList; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->col[0], line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- assemble lines into blocks + + if (rawOrder) { + + lines = lineList; + for (line1 = lines; line1; line1 = line1->next) { + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + } + + } else { + + // sort lines into yx order + lines = NULL; + while (lineList) { + line0 = lineList; + lineList = lineList->next; + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->next) ; + if (line1) { + line1->next = line0; + } else { + lines = line0; + } + line0->next = line2; + } + + // compute whitespace to left and right of each line + line0 = lines; + for (line1 = lines; line1; line1 = line1->next) { + + // find the first vertically overlapping line + for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ; + + // check each vertically overlapping line -- look for the nearest + // on each side + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + for (line2 = line0; + line2 && line2->yMin < line1->yMax; + line2 = line2->next) { + if (line2->yMax > line1->yMin) { + if (line2->xMax < line1->xMin) { + if (line2->xMax > line1->xSpaceL) { + line1->xSpaceL = line2->xMax; + } + } else if (line2->xMin > line1->xMax) { + if (line2->xMin < line1->xSpaceR) { + line1->xSpaceR = line2->xMin; + } + } + } + } + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + for (line0 = lines; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + lineList = lines; + yxBlocks = NULL; + blk0 = NULL; + while (lineList) { + + // build a new block object + line0 = lineList; + lineList = lineList->next; + line0->next = NULL; + blk1 = new TextBlock(); + blk1->lines = line0; + blk1->xMin = line0->xMin; + blk1->xMax = line0->xMax; + blk1->yMin = line0->yMin; + blk1->yMax = line0->yMax; + blk1->xSpaceL = line0->xSpaceL; + blk1->xSpaceR = line0->xSpaceR; + blk1->maxFontSize = line0->fontSize; + + // find subsequent lines in the block + while (lineList) { + + // look for the first horizontally overlapping line below this + // one + yLimit = line0->yMax + blkMaxSpacing * line0->fontSize; + line3 = line4 = NULL; + if (rawOrder) { + if (lineList->yMin < yLimit && + lineList->xMax > blk1->xMin && + lineList->xMin < blk1->xMax) { + line3 = NULL; + line4 = lineList; + } + } else { + for (line1 = NULL, line2 = lineList; + line2 && line2->yMin < yLimit; + line1 = line2, line2 = line2->next) { + if (line2->xMax > blk1->xMin && + line2->xMin < blk1->xMax) { + line3 = line1; + line4 = line2; + break; + } + } + } + + // if there is an overlapping line and it fits in the block, add + // it to the block + if (line4 && blockFit(blk1, line4)) { + if (line3) { + line3->next = line4->next; + } else { + lineList = line4->next; + } + line0->next = line0->flowNext = line4; + line4->next = NULL; + if (line4->xMin < blk1->xMin) { + blk1->xMin = line4->xMin; + } else if (line4->xMax > blk1->xMax) { + blk1->xMax = line4->xMax; + } + if (line4->yMax > blk1->yMax) { + blk1->yMax = line4->yMax; + } + if (line4->xSpaceL > blk1->xSpaceL) { + blk1->xSpaceL = line4->xSpaceL; + } + if (line4->xSpaceR < blk1->xSpaceR) { + blk1->xSpaceR = line4->xSpaceR; + } + if (line4->fontSize > blk1->maxFontSize) { + blk1->maxFontSize = line4->fontSize; + } + line0 = line4; + + // otherwise, we're done with this block + } else { + break; } - str1->size = (n + 15) & ~15; - str1->text = (Unicode *)grealloc(str1->text, - str1->size * sizeof(Unicode)); - str1->xRight = (double *)grealloc(str1->xRight, - str1->size * sizeof(double)); - if (addSpace) { - str1->text[str1->len] = 0x20; - str1->xRight[str1->len] = str2->xMin; - ++str1->len; - } - for (i = 0; i < str2->len; ++i) { - str1->text[str1->len] = str2->text[i]; - str1->xRight[str1->len] = str2->xRight[i]; - ++str1->len; - } - if (str2->xMax > str1->xMax) { - str1->xMax = str2->xMax; - } - if (str2->yMax > str1->yMax) { - str1->yMax = str2->yMax; - } - str1->yxNext = str2->yxNext; - delete str2; + } + + // insert block on list, in yx order + if (rawOrder) { + blk2 = blk0; + blk3 = NULL; + blk0 = blk1; } else { - str1 = str2; + for (blk2 = NULL, blk3 = yxBlocks; + blk3 && !blk1->yxBefore(blk3); + blk2 = blk3, blk3 = blk3->next) ; + } + blk1->next = blk3; + if (blk2) { + blk2->next = blk1; + } else { + yxBlocks = blk1; + } + } + +#if 0 // for debugging + printf("*** blocks in yx order ***\n"); + for (blk0 = yxBlocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- merge lines and blocks, sort blocks into reading order + + if (rawOrder) { + blocks = yxBlocks; + + } else { + blocks = NULL; + blk0 = NULL; + blkStack = NULL; + while (yxBlocks) { + + // find the next two blocks: + // - if the depth-first traversal stack is empty, take the first + // (upper-left-most) two blocks on the yx-sorted block list + // - otherwise, find the two upper-left-most blocks under the top + // block on the stack + if (blkStack) { + blk3 = blk4 = blk5 = blk6 = NULL; + for (blk1 = NULL, blk2 = yxBlocks; + blk2; + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin > blkStack->yMin && + blk2->xMax > blkStack->xMin && + blk2->xMin < blkStack->xMax) { + if (!blk4 || blk2->yxBefore(blk4)) { + blk5 = blk3; + blk6 = blk4; + blk3 = blk1; + blk4 = blk2; + } else if (!blk6 || blk2->yxBefore(blk6)) { + blk5 = blk1; + blk6 = blk2; + } + } + } + } else { + blk3 = NULL; + blk4 = yxBlocks; + blk5 = yxBlocks; + blk6 = yxBlocks->next; + } + + // merge case 1: + // | | | + // | blkStack | | blkStack + // +---------------------+ --> +-------------- + // +------+ +------+ +-----------+ + // | blk4 | | blk6 | ... | blk4+blk6 | + // +------+ +------+ +-----------+ + yLimit = 0; // make gcc happy + if (blkStack) { + yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize; + } + if (blkStack && blk4 && blk6 && + !blk4->lines->next && !blk6->lines->next && + lineFit2(blk4->lines, blk6->lines) && + blk4->yMin < yLimit && + blk4->xMin > blkStack->xSpaceL && + blkStack->xMin > blk4->xSpaceL && + blk6->xMax < blkStack->xSpaceR) { + blk4->mergeRight(blk6); + if (blk5) { + blk5->next = blk6->next; + } else { + yxBlocks = blk6->next; + } + delete blk6; + + // merge case 2: + // | | | | + // | blkStack | | | + // +---------------------+ --> | blkStack+blk2 | + // +---------------------+ | | + // | blk4 | | | + // | | | | + } else if (blkStack && blk4 && + blk4->yMin < yLimit && + blockFit2(blkStack, blk4)) { + blkStack->mergeBelow(blk4); + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + delete blk4; + + // if any of: + // 1. no block found + // 2. non-fully overlapping block found + // 3. large vertical gap above the overlapping block + // then pop the stack and try again + } else if (!blk4 || + (blkStack && (blk4->xMin < blkStack->xSpaceL || + blk4->xMax > blkStack->xSpaceR || + blk4->yMin - blkStack->yMax > + blkMaxSortSpacing * blkStack->maxFontSize))) { + blkStack = blkStack->stackNext; + + // add a block to the sorted list + } else { + + // remove the block from the yx-sorted list + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + blk4->next = NULL; + + // append the block to the reading-order list + if (blk0) { + blk0->next = blk4; + } else { + blocks = blk4; + } + blk0 = blk4; + + // push the block on the traversal stack + if (!physLayout) { + blk4->stackNext = blkStack; + blkStack = blk4; + } + } + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** blocks in reading order (after merging) ***\n"); + for (blk0 = blocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- assemble blocks into flows + + if (rawOrder) { + + // one flow per block + flow0 = NULL; + while (blocks) { + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + blocks = blocks->next; + flow1->blocks->next = NULL; + if (flow0) { + flow0->next = flow1; + } else { + flows = flow1; + } + flow0 = flow1; + } + + } else { + + // compute whitespace above and below each block + for (blk0 = blocks; blk0; blk0 = blk0->next) { + blk0->ySpaceT = 0; + blk0->ySpaceB = pageHeight; + + // check each horizontally overlapping block + for (blk1 = blocks; blk1; blk1 = blk1->next) { + if (blk1 != blk0 && + blk1->xMin < blk0->xMax && + blk1->xMax > blk0->xMin) { + if (blk1->yMax < blk0->yMin) { + if (blk1->yMax > blk0->ySpaceT) { + blk0->ySpaceT = blk1->yMax; + } + } else if (blk1->yMin > blk0->yMax) { + if (blk1->yMin < blk0->ySpaceB) { + blk0->ySpaceB = blk1->yMin; + } + } + } + } + } + + flow0 = NULL; + while (blocks) { + + // build a new flow object + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + flow1->ySpaceT = blocks->ySpaceT; + flow1->ySpaceB = blocks->ySpaceB; + + // find subsequent blocks in the flow + for (blk1 = blocks, blk2 = blocks->next; + blk2 && flowFit(flow1, blk2); + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin < flow1->yMin) { + flow1->yMin = blk2->yMin; + } + if (blk2->yMax > flow1->yMax) { + flow1->yMax = blk2->yMax; + } + if (blk2->ySpaceT > flow1->ySpaceT) { + flow1->ySpaceT = blk2->ySpaceT; + } + if (blk2->ySpaceB < flow1->ySpaceB) { + flow1->ySpaceB = blk2->ySpaceB; + } + for (line1 = blk1->lines; line1->next; line1 = line1->next) ; + line1->flowNext = blk2->lines; + } + + // chop the block list + blocks = blk1->next; + blk1->next = NULL; + + // append the flow to the list + if (flow0) { + flow0->next = flow1; + } else { + flows = flow1; + } + flow0 = flow1; } } + +#if 0 // for debugging + printf("*** flows ***\n"); + for (flow0 = flows; flow0; flow0 = flow0->next) { + printf("[flow]\n"); + for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) { + printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax, + blk0->ySpaceT, blk0->ySpaceB); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- sort lines into yx order + + // (the block/line merging process doesn't maintain the full-page + // linked list of lines) + + lines = NULL; + if (rawOrder) { + line0 = NULL; + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line1 = flow0->lines; line1; line1 = line1->flowNext) { + if (line0) { + line0->pageNext = line1; + } else { + lines = line1; + } + line0 = line1; + } + } + } else { + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line0 = flow0->lines; line0; line0 = line0->flowNext) { + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->pageNext) ; + if (line1) { + line1->pageNext = line0; + } else { + lines = line0; + } + line0->pageNext = line2; + } + } + } + +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + for (line0 = lines; line0; line0 = line0->pageNext) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0], + line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif +} + +// If can be added the end of , return the absolute value +// of the difference between 's baseline and 's baseline, +// and set * to the horizontal space between the current last +// word in and . A smaller return value indicates a +// better fit. Otherwise, return a negative number. +double TextPage::lineFit(TextLine *line, TextWord *word, double *space) { + TextWord *lastWord; + double fontSize0, fontSize1; + double dx, dxLimit; + + lastWord = line->lastWord; + fontSize0 = line->fontSize; + fontSize1 = word->fontSize; + dx = word->xMin - lastWord->xMax; + dxLimit = fontSize0 * lastWord->font->maxSpaceWidth; + + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX || + dx > dxLimit) { + return -1; + } + + if ( + // look for adjacent words with close baselines and close font sizes + (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) || + + // look for a superscript + (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 && + (word->yMax < lastWord->yMax || + word->yBase < lastWord->yBase) && + word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSuperscriptDeltaX) || + + // look for a subscript + (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 && + (word->yMin > lastWord->yMin || + word->yBase > lastWord->yBase) && + line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSubscriptDeltaX)) { + + *space = dx; + return fabs(word->yBase - line->yBase); + } + + return -1; } +// Returns true if and can be merged into a single +// line, ignoring max word spacing. +GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) { + double fontSize0, fontSize1; + double dx; + + fontSize0 = line0->fontSize; + fontSize1 = line1->fontSize; + dx = line1->xMin - line0->xMax; + + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX) { + return gFalse; + } + + // look for close baselines and close font sizes + if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) { + return gTrue; + } + + return gFalse; +} + +// Returns true if can be added to . Assumes the y +// coordinates are within range. +GBool TextPage::blockFit(TextBlock *blk, TextLine *line) { + double fontSize0, fontSize1; + + // check edges + if (line->xMin < blk->xSpaceL || + line->xMax > blk->xSpaceR || + blk->xMin < line->xSpaceL || + blk->xMax > line->xSpaceR) { + return gFalse; + } + + // check font sizes + fontSize0 = blk->lines->fontSize; + fontSize1 = line->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; + } + + return gTrue; +} + +// Returns true if and can be merged into a single +// block. Assumes the y coordinates are within range. +GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) { + double fontSize0, fontSize1; + + // check edges + if (blk1->xMin < blk0->xSpaceL || + blk1->xMax > blk0->xSpaceR || + blk0->xMin < blk1->xSpaceL || + blk0->xMax > blk1->xSpaceR) { + return gFalse; + } + + // check font sizes + fontSize0 = blk0->lines->fontSize; + fontSize1 = blk1->lines->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; + } + + return gTrue; +} + +// Returns true if can be added to . +GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) { + double dy; + + // check whitespace above and below + if (blk->yMin < flow->ySpaceT || + blk->yMax > flow->ySpaceB || + flow->yMin < blk->ySpaceT || + flow->yMax > blk->ySpaceB) { + return gFalse; + } + + // check that block top edge is within +/- dy of flow top edge, + // and that block bottom edge is above flow bottom edge + dy + dy = flowMaxDeltaY * flow->blocks->maxFontSize; + return blk->yMin > flow->yMin - dy && + blk->yMin < flow->yMin + dy && + blk->yMax < flow->yMax + dy; +} + + GBool TextPage::findText(Unicode *s, int len, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { - TextString *str; + TextLine *line; Unicode *p; Unicode u1, u2; int m, i, j; - double x; + double x0, x1, x; - // scan all strings on page - for (str = yxStrings; str; str = str->yxNext) { + // scan all text on the page + for (line = lines; line; line = line->pageNext) { // check: above top limit? - if (!top && (str->yMax < *yMin || - (str->yMin < *yMin && str->xMax <= *xMin))) { + if (!top && (line->yMax < *yMin || + (line->yMin < *yMin && line->xMax <= *xMin))) { continue; } // check: below bottom limit? - if (!bottom && (str->yMin > *yMax || - (str->yMax > *yMax && str->xMin >= *xMax))) { + if (!bottom && (line->yMin > *yMax || + (line->yMax > *yMax && line->xMin >= *xMax))) { return gFalse; } - // search each position in this string - m = str->len; - for (i = 0, p = str->text; i <= m - len; ++i, ++p) { + // search each position in this line + m = line->len; + for (i = 0, p = line->text; i <= m - len; ++i, ++p) { + + x0 = (i == 0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + x = 0.5 * (x0 + x1); // check: above top limit? - if (!top && str->yMin < *yMin) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; + if (!top && line->yMin < *yMin) { if (x < *xMin) { continue; } } // check: below bottom limit? - if (!bottom && str->yMax > *yMax) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; + if (!bottom && line->yMax > *yMax) { if (x > *xMax) { return gFalse; } @@ -352,14 +1708,15 @@ GBool TextPage::findText(Unicode *s, int len, // found it if (j == len) { - *xMin = (i == 0) ? str->xMin : str->xRight[i-1]; - *xMax = str->xRight[i + len - 1]; - *yMin = str->yMin; - *yMax = str->yMax; + *xMin = x0; + *xMax = line->xRight[i + len - 1]; + *yMin = line->yMin; + *yMax = line->yMax; return gTrue; } } } + return gFalse; } @@ -367,18 +1724,21 @@ GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; UnicodeMap *uMap; + GBool isUnicode; char space[8], eol[16], buf[8]; - int spaceLen, eolLen, n; - TextString *str1; - double x0, x1, x2, y; - double xPrev, yPrev; - int i1, i2, i; + int spaceLen, eolLen, len; + TextLine *line, *prevLine; + double x0, x1, y; + int firstCol, col, i; GBool multiLine; s = new GString(); + + // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; } + isUnicode = uMap->isUnicode(); spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = 0; // make gcc happy switch (globalParams->getTextEOL()) { @@ -393,61 +1753,170 @@ GString *TextPage::getText(double xMin, double yMin, eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); break; } - xPrev = yPrev = 0; - multiLine = gFalse; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - y = 0.5 * (str1->yMin + str1->yMax); - if (y > yMax) { + + // find the leftmost column + firstCol = -1; + for (line = lines; line; line = line->pageNext) { + if (line->yMin > yMax) { break; } - if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) { - x0 = x1 = x2 = str1->xMin; - for (i1 = 0; i1 < str1->len; ++i1) { - x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1]; - x1 = str1->xRight[i1]; - if (0.5 * (x0 + x1) >= xMin) { - break; - } - } - for (i2 = str1->len - 1; i2 > i1; --i2) { - x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1]; - x2 = str1->xRight[i2]; - if (0.5 * (x1 + x2) <= xMax) { - break; - } + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { + continue; + } + + y = 0.5 * (line->yMin + line->yMax); + if (y < yMin || y > yMax) { + continue; + } + + i = 0; + while (i < line->len) { + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { + break; } - if (s->getLength() > 0) { - if (x0 < xPrev || str1->yMin > yPrev) { - s->append(eol, eolLen); - multiLine = gTrue; - } else { - for (i = 0; i < 4; ++i) { - s->append(space, spaceLen); - } - } + ++i; + } + if (i == line->len) { + continue; + } + col = line->col[i]; + + if (firstCol < 0 || col < firstCol) { + firstCol = col; + } + } + + // extract the text + col = firstCol; + multiLine = gFalse; + prevLine = NULL; + for (line = lines; line; line = line->pageNext) { + if (line->yMin > yMax) { + break; + } + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { + continue; + } + + y = 0.5 * (line->yMin + line->yMax); + if (y < yMin || y > yMax) { + continue; + } + + i = 0; + while (i < line->len) { + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { + break; } - for (i = i1; i <= i2; ++i) { - n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf)); - s->append(buf, n); + ++i; + } + if (i == line->len) { + continue; + } + + // insert a return + if (line->col[i] < col || + (prevLine && + line->yMin > + prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) { + s->append(eol, eolLen); + col = firstCol; + multiLine = gTrue; + } + prevLine = line; + + // line this block up with the correct column + for (; col < line->col[i]; ++col) { + s->append(space, spaceLen); + } + + // print the portion of the line + for (; i < line->len; ++i) { + + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMax) { + break; } - xPrev = x2; - yPrev = str1->yMax; + + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + s->append(buf, len); + col += isUnicode ? 1 : len; } } + if (multiLine) { s->append(eol, eolLen); } + uMap->decRefCnt(); + return s; } -void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { +GBool TextPage::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + TextLine *line; + TextWord *word; + double x; + GBool first; + int i; + + //~ this doesn't correctly handle: + //~ - ranges split across multiple lines (the highlighted region + //~ is the bounding box of all the parts of the range) + //~ - cases where characters don't convert one-to-one into Unicode + first = gTrue; + for (line = lines; line; line = line->pageNext) { + for (word = line->words; word; word = word->next) { + if (pos < word->charPos + word->charLen && + word->charPos < pos + length) { + i = pos - word->charPos; + if (i < 0) { + i = 0; + } + x = (i == 0) ? word->xMin : word->xRight[i - 1]; + if (first || x < *xMin) { + *xMin = x; + } + i = pos + length - word->charPos; + if (i >= word->len) { + i = word->len - 1; + } + x = word->xRight[i]; + if (first || x > *xMax) { + *xMax = x; + } + if (first || word->yMin < *yMin) { + *yMin = word->yMin; + } + if (first || word->yMax > *yMax) { + *yMax = word->yMax; + } + first = gFalse; + } + } + } + return !first; +} + +void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout) { UnicodeMap *uMap; char space[8], eol[16], eop[8], buf[8]; - int spaceLen, eolLen, eopLen, n; - TextString *str1, *str2, *str3; - double yMin, yMax; - int col1, col2, d, i; + int spaceLen, eolLen, eopLen, len; + TextFlow *flow; + TextLine *line; + int col, d, n, i; // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { @@ -469,156 +1938,127 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); - // build x-major list - xyStrings = NULL; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - for (str2 = NULL, str3 = xyStrings; - str3; - str2 = str3, str3 = str3->xyNext) { - if (str1->xMin < str3->xMin || - (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) { - break; - } - } - if (str2) { - str2->xyNext = str1; - } else { - xyStrings = str1; - } - str1->xyNext = str3; - } + // output the page, maintaining the original physical layout + if (physLayout || rawOrder) { + col = 0; + for (line = lines; line; line = line->pageNext) { - // do column assignment - for (str1 = xyStrings; str1; str1 = str1->xyNext) { - col1 = 0; - for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) { - if (str1->xMin >= str2->xMax) { - col2 = str2->col + str2->len + 4; - if (col2 > col1) { - col1 = col2; - } - } else if (str1->xMin > str2->xMin) { - col2 = str2->col + - (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) * - str2->len); - if (col2 > col1) { - col1 = col2; + // line this block up with the correct column + if (!rawOrder) { + for (; col < line->col[0]; ++col) { + (*outputFunc)(outputStream, space, spaceLen); } } - } - str1->col = col1; - } - -#if 0 //~ for debugging - fprintf((FILE *)outputStream, "~~~~~~~~~~\n"); - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - fprintf((FILE *)outputStream, "(%4d,%4d) - (%4d,%4d) [%3d] '", - (int)str1->xMin, (int)str1->yMin, - (int)str1->xMax, (int)str1->yMax, str1->col); - for (i = 0; i < str1->len; ++i) { - fputc(str1->text[i] & 0xff, stdout); - } - printf("'\n"); - } - fprintf((FILE *)outputStream, "~~~~~~~~~~\n"); -#endif - - // output - col1 = 0; - yMax = yxStrings ? yxStrings->yMax : 0; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - - // line this string up with the correct column - if (rawOrder && col1 == 0) { - col1 = str1->col; - } else { - for (; col1 < str1->col; ++col1) { - (*outputFunc)(outputStream, space, spaceLen); - } - } - // print the string - for (i = 0; i < str1->len; ++i) { - if ((n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf))) > 0) { - (*outputFunc)(outputStream, buf, n); + // print the line + for (i = 0; i < line->len; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); } - } - - // increment column - col1 += str1->len; - - // update yMax for this line - if (str1->yMax > yMax) { - yMax = str1->yMax; - } - - // if we've hit the end of the line... - if (!(str1->yxNext && - !(rawOrder && str1->yxNext->yMax < str1->yMin) && - str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax && - str1->yxNext->xMin >= str1->xMax)) { - - // print a return - (*outputFunc)(outputStream, eol, eolLen); - - // print extra vertical space if necessary - if (str1->yxNext) { - - // find yMin for next line - yMin = str1->yxNext->yMin; - for (str2 = str1->yxNext; str2; str2 = str2->yxNext) { - if (str2->yMin < yMin) { - yMin = str2->yMin; - } - if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax && - str2->yxNext->xMin >= str2->xMax)) - break; + col += line->convertedLen; + + // print one or more returns if necessary + if (rawOrder || + !line->pageNext || + line->pageNext->col[0] < col || + line->pageNext->yMin > + line->yMax - lineOverlapSlack * line->fontSize) { + + // compute number of returns + d = 1; + if (line->pageNext) { + d += (int)((line->pageNext->yMin - line->yMax) / + line->fontSize + 0.5); } - - // print the space - d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5); + // various things (weird font matrices) can result in bogus // values here, so do a sanity check - if (rawOrder && d > 2) { - d = 2; - } else if (!rawOrder && d > 5) { + if (d < 1) { + d = 1; + } else if (d > 5) { d = 5; } for (; d > 0; --d) { (*outputFunc)(outputStream, eol, eolLen); } + + col = 0; } + } - // set up for next line - col1 = 0; - yMax = str1->yxNext ? str1->yxNext->yMax : 0; + // output the page, "undoing" the layout + } else { + for (flow = flows; flow; flow = flow->next) { + for (line = flow->lines; line; line = line->flowNext) { + n = line->len; + if (line->flowNext && line->hyphenated) { + --n; + } + for (i = 0; i < n; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); + } + if (line->flowNext && !line->hyphenated) { + (*outputFunc)(outputStream, space, spaceLen); + } + } + (*outputFunc)(outputStream, eol, eolLen); + (*outputFunc)(outputStream, eol, eolLen); } } // end of page - (*outputFunc)(outputStream, eol, eolLen); (*outputFunc)(outputStream, eop, eopLen); (*outputFunc)(outputStream, eol, eolLen); uMap->decRefCnt(); } +void TextPage::startPage(GfxState *state) { + clear(); + if (state) { + pageWidth = state->getPageWidth(); + pageHeight = state->getPageHeight(); + } else { + pageWidth = pageHeight = 0; + } +} + void TextPage::clear() { - TextString *p1, *p2; + TextWord *w1, *w2; + TextFlow *f1, *f2; - if (curStr) { - delete curStr; - curStr = NULL; + if (curWord) { + delete curWord; + curWord = NULL; } - for (p1 = yxStrings; p1; p1 = p2) { - p2 = p1->yxNext; - delete p1; + if (words) { + for (w1 = words; w1; w1 = w2) { + w2 = w1->next; + delete w1; + } + } else if (flows) { + for (f1 = flows; f1; f1 = f2) { + f2 = f1->next; + delete f1; + } } - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; + deleteGList(fonts, TextFontInfo); + + curWord = NULL; + charPos = 0; + font = NULL; + fontSize = 0; + nest = 0; + nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); + } + //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ @@ -627,8 +2067,10 @@ static void outputToFile(void *stream, char *text, int len) { fwrite(text, 1, len, (FILE *)stream); } -TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) { +TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append) { text = NULL; + physLayout = physLayoutA; rawOrder = rawOrderA; ok = gTrue; @@ -637,6 +2079,10 @@ TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) { if (fileName) { if (!strcmp(fileName, "-")) { outputStream = stdout; +#ifdef WIN32 + // keep DOS from munging the end-of-line characters + setmode(fileno(stdout), O_BINARY); +#endif } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) { needClose = gTrue; } else { @@ -650,16 +2096,17 @@ TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) { } // set up text object - text = new TextPage(rawOrder); + text = new TextPage(rawOrderA); } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, - GBool rawOrderA) { + GBool physLayoutA, GBool rawOrderA) { outputFunc = func; outputStream = stream; needClose = gFalse; + physLayout = physLayoutA; rawOrder = rawOrderA; - text = new TextPage(rawOrder); + text = new TextPage(rawOrderA); ok = gTrue; } @@ -676,13 +2123,13 @@ TextOutputDev::~TextOutputDev() { } void TextOutputDev::startPage(int pageNum, GfxState *state) { - text->clear(); + text->startPage(state); } void TextOutputDev::endPage() { - text->coalesce(); + text->coalesce(physLayout); if (outputStream) { - text->dump(outputStream, outputFunc); + text->dump(outputStream, outputFunc, physLayout); } } @@ -691,18 +2138,18 @@ void TextOutputDev::updateFont(GfxState *state) { } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state); + text->beginWord(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { - text->endString(); + text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, Unicode *u, int uLen) { - text->addChar(state, x, y, dx, dy, u, uLen); + text->addChar(state, x, y, dx, dy, c, u, uLen); } GBool TextOutputDev::findText(Unicode *s, int len, @@ -711,3 +2158,16 @@ GBool TextOutputDev::findText(Unicode *s, int len, double *xMax, double *yMax) { return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax); } + +GString *TextOutputDev::getText(double xMin, double yMin, + double xMax, double yMax) { + return text->getText(xMin, yMin, xMax, yMax); +} + +GBool TextOutputDev::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); +} + +