X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.cc;h=3fcc9ec294ecdbd1e7800c98764c0928e04ca6c8;hb=884f739665dc56e66f51e104350f2affd33f2dd8;hp=6d675ef40663e9b9c3eea1da46fae506f10577fc;hpb=50e9d31c05e9ca11ad43cc570556094782c1b956;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 6d675ef4..3fcc9ec2 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -2,920 +2,2172 @@ // // TextOutputDev.cc // -// Copyright 1997 Derek B. Noonburg +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== -#ifdef __GNUC__ +#include + +#ifdef USE_GCC_PRAGMAS #pragma implementation #endif #include #include #include +#include #include -#include "GString.h" +#ifdef WIN32 +#include <fcntl.h> // for O_BINARY +#include <io.h> // for setmode +#endif #include "gmem.h" +#include "GString.h" +#include "GList.h" #include "config.h" #include "Error.h" +#include "GlobalParams.h" +#include "UnicodeMap.h" #include "GfxState.h" -#include "FontEncoding.h" #include "TextOutputDev.h" -#include "TextOutputFontInfo.h" +#ifdef MACOS +// needed for setting type/creator of MacOS files +#include "ICSupport.h" +#endif + +//------------------------------------------------------------------------ +// parameters +//------------------------------------------------------------------------ + +// Minimum and maximum inter-word spacing (as a fraction of the average +// character width). +#define wordMinSpaceWidth 0.3 +#define wordMaxSpaceWidth 2.0 + +// Default min and max inter-word spacing (when the average character +// width is unknown). +#define wordDefMinSpaceWidth 0.2 +#define wordDefMaxSpaceWidth 1.5 + +// Max difference in x,y coordinates (as a fraction of the font size) +// allowed for duplicated text (fake boldface, drop shadows) which is +// to be discarded. 
+#define dupMaxDeltaX 0.1 +#define dupMaxDeltaY 0.2 + +// Min overlap (as a fraction of the font size) required for two +// lines to be considered vertically overlapping. +#define lineOverlapSlack 0.5 + +// Max difference in baseline y coordinates (as a fraction of the font +// size) allowed for words which are to be grouped into a line, not +// including sub/superscripts. +#define lineMaxBaselineDelta 0.1 + +// Max ratio of font sizes allowed for words which are to be grouped +// into a line, not including sub/superscripts. +#define lineMaxFontSizeRatio 1.4 + +// Min spacing (as a fraction of the font size) allowed between words +// which are to be grouped into a line. +#define lineMinDeltaX -0.5 + +// Minimum vertical overlap (as a fraction of the font size) required +// for superscript and subscript words. +#define lineMinSuperscriptOverlap 0.3 +#define lineMinSubscriptOverlap 0.3 + +// Min/max ratio of font sizes allowed for sub/superscripts compared to +// the base text. +#define lineMinSubscriptFontSizeRatio 0.4 +#define lineMaxSubscriptFontSizeRatio 1.01 +#define lineMinSuperscriptFontSizeRatio 0.4 +#define lineMaxSuperscriptFontSizeRatio 1.01 + +// Max horizontal spacing (as a fraction of the font size) allowed +// before sub/superscripts. +#define lineMaxSubscriptDeltaX 0.2 +#define lineMaxSuperscriptDeltaX 0.2 + +// Maximum vertical spacing (as a fraction of the font size) allowed +// for lines which are to be grouped into a block. +#define blkMaxSpacing 2.0 + +// Max ratio of primary font sizes allowed for lines which are to be +// grouped into a block. +#define blkMaxFontSizeRatio 1.3 + +// Min overlap (as a fraction of the font size) required for two +// blocks to be considered vertically overlapping. +#define blkOverlapSlack 0.5 + +// Max vertical spacing (as a fraction of the font size) allowed +// between blocks which are 'adjacent' when sorted by reading order. 
+#define blkMaxSortSpacing 2.0
+
+// Max vertical offset (as a fraction of the font size) of the top and
+// bottom edges allowed for blocks which are to be grouped into a
+// flow.
+#define flowMaxDeltaY 1.0
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+// Record the font and effective horizontal scaling of <state>, and
+// derive the min/max inter-word space widths to use with that font.
+TextFontInfo::TextFontInfo(GfxState *state) {
+  double *textMat;
+  double t1, t2, avgWidth, w;
+  int n, i;
+
+  gfxFont = state->getFont();
+  textMat = state->getTextMat();
+  horizScaling = state->getHorizScaling();
+  // fold the ratio of the text matrix's two diagonal entries into the
+  // horizontal scaling; skipped unless both magnitudes exceed 0.01
+  if ((t1 = fabs(textMat[0])) > 0.01 &&
+      (t2 = fabs(textMat[3])) > 0.01) {
+    horizScaling *= t1 / t2;
+  }
+
+  // start with the default limits; they remain in effect unless an
+  // average character width is computed below
+  minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+  maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+  if (gfxFont && gfxFont->isCIDFont()) {
+    //~ handle 16-bit fonts
+  } else if (gfxFont && gfxFont->getType() != fontType3) {
+    // average the widths of all codes 0..255 that report a positive
+    // width
+    avgWidth = 0;
+    n = 0;
+    for (i = 0; i < 256; ++i) {
+      w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
+      if (w > 0) {
+        avgWidth += w;
+        ++n;
+      }
+    }
+    if (n > 0) {
+      avgWidth /= n;
+      minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
+      maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
+    }
+  }
+
+}
+
+TextFontInfo::~TextFontInfo() {
+}
+
+// Returns true if <state> refers to the same GfxFont object and has
+// (to within 0.01) the same effective horizontal scaling as the state
+// this object was built from.  The scaling is recomputed exactly as in
+// the constructor.
+GBool TextFontInfo::matches(GfxState *state) {
+  double *textMat;
+  double t1, t2, h;
+
+  textMat = state->getTextMat();
+  h = state->getHorizScaling();
+  if ((t1 = fabs(textMat[0])) > 0.01 &&
+      (t2 = fabs(textMat[3])) > 0.01) {
+    h *= t1 / t2;
+  }
+  return state->getFont() == gfxFont &&
+         fabs(h - horizScaling) < 0.01;
+}
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+// Start an empty word.  (x0, y0) is passed through state->transform();
+// the transformed y becomes the baseline (yBase) and, together with
+// the font's ascent/descent scaled by <fontSizeA>, the vertical extent
+// (yMin/yMax).  xMin/xMax are not set here -- addChar() sets them when
+// characters are appended.
+TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
+                   TextFontInfo *fontA, double fontSizeA) {
+  GfxFont *gfxFont;
+  double x, y;
+
+  charPos = charPosA;
+  charLen = 0;
+  font = fontA;
+  fontSize = fontSizeA;
+  state->transform(x0, y0, &x, &y);
+  if ((gfxFont = font->gfxFont)) {
+    yMin = y - gfxFont->getAscent() * fontSize;
+    yMax = y - gfxFont->getDescent() * fontSize;
+  } else {
+    // this means that the PDF file draws text without a current font,
+    // which should never happen
+    yMin = y - 0.95 * fontSize;
+    yMax = y + 0.35 * fontSize;
+  }
+  if (yMin == yMax) {
+    // this is a sanity check for a case that shouldn't happen -- but
+    // if it does happen, we want to avoid dividing by zero later
+    yMin = y;
+    yMax = y + 1;
+  }
+  yBase = y;
+  text = NULL;
+  xRight = NULL;
+  len = size = 0;
+  spaceAfter = gFalse;
+  next = NULL;
+
+}
+
+
+// Frees the character and right-edge arrays.
+TextWord::~TextWord() {
+  gfree(text);
+  gfree(xRight);
+}
+
+// Append character <u>, which starts at <x> and ends at <x> + <dx>.
+// The first character appended sets xMin; every character updates
+// xMax.  <state>, <y> and <dy> are not used by this implementation.
+void TextWord::addChar(GfxState *state, double x, double y,
+                       double dx, double dy, Unicode u) {
+  // grow both parallel arrays in steps of 16 entries
+  if (len == size) {
+    size += 16;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  text[len] = u;
+  if (len == 0) {
+    xMin = x;
+  }
+  xMax = xRight[len] = x + dx;
+  ++len;
+}
+
+// Returns true if this word comes before <word2> in xy order, i.e.
+// ordered by xMin, with ties broken by yMin.
+GBool TextWord::xyBefore(TextWord *word2) {
+  return xMin < word2->xMin ||
+         (xMin == word2->xMin && yMin < word2->yMin);
+}
+
+// Merge another word onto the end of this one.
+void TextWord::merge(TextWord *word2) {
+  int i;
+
+  // word2 supplies the new right edge unconditionally; the vertical
+  // extent becomes the union of both words
+  xMax = word2->xMax;
+  if (word2->yMin < yMin) {
+    yMin = word2->yMin;
+  }
+  if (word2->yMax > yMax) {
+    yMax = word2->yMax;
+  }
+  // grow the arrays to exactly the combined length if needed
+  if (len + word2->len > size) {
+    size = len + word2->len;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  // copy word2's characters and right edges; word2 itself is neither
+  // modified nor freed here
+  for (i = 0; i < word2->len; ++i) {
+    text[len + i] = word2->text[i];
+    xRight[len + i] = word2->xRight[i];
+  }
+  len += word2->len;
+  charLen += word2->charLen;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+// Create an empty line.  Only the list links, the text/xRight/col
+// arrays, len and hyphenated are initialized here; the remaining
+// members are not set by this constructor.
+TextLine::TextLine() {
+  words = NULL;
+  text = NULL;
+  xRight = NULL;
+  col = NULL;
+  len = 0;
+  hyphenated = gFalse;
+  pageNext = NULL;
+  next = NULL;
+  flowNext = NULL;
+}
+
+// Deletes every word on the <words> list and frees the text, xRight
+// and col arrays.
+TextLine::~TextLine() {
+  TextWord *w1, *w2;
+
+  for (w1 = words; w1; w1 = w2) {
+    w2 = w1->next;
+    delete w1;
+  }
+  gfree(text);
+  gfree(xRight);
+  gfree(col);
+}
+
+// Returns true if this line comes before <line2> in yx order, allowing
+// slack for vertically overlapping lines.
+GBool TextLine::yxBefore(TextLine *line2) {
+  double dy;
+
+  // amount of vertical overlap (scaled by this line's font size)
+  // required before the two lines are treated as overlapping
+  dy = lineOverlapSlack * fontSize;
+
+  // non-overlapping case: order by yMin, ties broken by xMin
+  if (line2->yMin > yMax - dy ||
+      line2->yMax < yMin + dy) {
+    return yMin < line2->yMin ||
+           (yMin == line2->yMin && xMin < line2->xMin);
+  }
+
+  // overlapping case: order by xMin only
+  return xMin < line2->xMin;
+}
+
+// Merge another line's words onto the end of this line.
+void TextLine::merge(TextLine *line2) {
+  int newLen, i;
+
+  // line2 supplies the new right edge and right-hand whitespace limit;
+  // the vertical extent becomes the union of both lines
+  xMax = line2->xMax;
+  if (line2->yMin < yMin) {
+    yMin = line2->yMin;
+  }
+  if (line2->yMax > yMax) {
+    yMax = line2->yMax;
+  }
+  xSpaceR = line2->xSpaceR;
+  // splice line2's word list after our last word; line2->words is
+  // cleared so the moved words are no longer reachable from line2
+  lastWord->spaceAfter = gTrue;
+  lastWord->next = line2->words;
+  lastWord = line2->lastWord;
+  line2->words = NULL;
+  // joined text is: this line's text, one space (U+0020), line2's
+  // text; the space's right edge is line2's left edge
+  // NOTE(review): col is not resized or extended here although len
+  // grows -- confirm that column data is not read past the old length
+  // (or is rebuilt) after a merge
+  newLen = len + 1 + line2->len;
+  text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
+  xRight = (double *)grealloc(xRight, newLen * sizeof(double));
+  text[len] = (Unicode)0x0020;
+  xRight[len] = line2->xMin;
+  for (i = 0; i < line2->len; ++i) {
+    text[len + 1 + i] = line2->text[i];
+    xRight[len + 1 + i] = line2->xRight[i];
+  }
+  len = newLen;
+  // NOTE(review): the inserted space is not counted in convertedLen --
+  // confirm this is intended
+  convertedLen += line2->convertedLen;
+  hyphenated = line2->hyphenated;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+// Create an empty block; only <lines> and <next> are initialized here.
+TextBlock::TextBlock() {
+  lines = NULL;
+  next = NULL;
+}
+
+// Deletes every line on the <lines> list (following the <next> links).
+TextBlock::~TextBlock() {
+  TextLine *l1, *l2;
+
+  for (l1 = lines; l1; l1 = l2) {
+    l2 = l1->next;
+    delete l1;
+  }
+}
+
+// Returns true if this block comes before <blk2> in yx order, allowing
+// slack for vertically overlapping blocks.  The slack is scaled by the
+// font size of this block's first line, so <lines> must be non-NULL.
+GBool TextBlock::yxBefore(TextBlock *blk2) {
+  double dy;
+
+  dy = blkOverlapSlack * lines->fontSize;
+
+  // non-overlapping case: order by yMin, ties broken by xMin
+  if (blk2->yMin > yMax - dy ||
+      blk2->yMax < yMin + dy) {
+    return yMin < blk2->yMin ||
+           (yMin == blk2->yMin && xMin < blk2->xMin);
+  }
+
+  // overlapping case: order by xMin only
+  return xMin < blk2->xMin;
+}
+
+// Merge another block's line onto the right of this one.  Only the
+// first line of each block is involved; afterwards this block's xMax,
+// yMin, yMax and xSpaceR are copied from the merged line (xMin and
+// xSpaceL are left unchanged).
+void TextBlock::mergeRight(TextBlock *blk2) {
+  lines->merge(blk2->lines);
+  xMax = lines->xMax;
+  yMin = lines->yMin;
+  yMax = lines->yMax;
+  xSpaceR = lines->xSpaceR;
+}
+
+// Merge another block's lines onto the bottom of this block.
+void TextBlock::mergeBelow(TextBlock *blk2) { + TextLine *line; + + if (blk2->xMin < xMin) { + xMin = blk2->xMin; + } + if (blk2->xMax > xMax) { + xMax = blk2->xMax; + } + yMax = blk2->yMax; + if (blk2->xSpaceL > xSpaceL) { + xSpaceL = blk2->xSpaceL; + } + if (blk2->xSpaceR < xSpaceR) { + xSpaceR = blk2->xSpaceR; + } + if (blk2->maxFontSize > maxFontSize) { + maxFontSize = blk2->maxFontSize; + } + for (line = lines; line->next; line = line->next) ; + line->next = line->flowNext = blk2->lines; + blk2->lines = NULL; +} + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +TextFlow::TextFlow() { + blocks = NULL; + next = NULL; +} + +TextFlow::~TextFlow() { + TextBlock *b1, *b2; + + for (b1 = blocks; b1; b1 = b2) { + b2 = b1->next; + delete b1; + } +} + + +//------------------------------------------------------------------------ +// TextPage +//------------------------------------------------------------------------ + +TextPage::TextPage(GBool rawOrderA) { + rawOrder = rawOrderA; + curWord = NULL; + charPos = 0; + font = NULL; + fontSize = 0; + nest = 0; + nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); +} + +TextPage::~TextPage() { + clear(); + delete fonts; +} + +void TextPage::updateFont(GfxState *state) { + GfxFont *gfxFont; + double *fm; + char *name; + int code, mCode, letterCode, anyCode; + double w; + int i; + + // get the font info object + font = NULL; + for (i = 0; i < fonts->getLength(); ++i) { + font = (TextFontInfo *)fonts->get(i); + if (font->matches(state)) { + break; + } + font = NULL; + } + if (!font) { + font = new TextFontInfo(state); + fonts->append(font); + } + + // adjust the font size + gfxFont = state->getFont(); + fontSize = state->getTransformedFontSize(); + if (gfxFont && gfxFont->getType() == fontType3) { + // This is a hack which makes it possible to deal with some Type 3 + 
// fonts. The problem is that it's impossible to know what the + // base coordinate system used in the font is without actually + // rendering the font. This code tries to guess by looking at the + // width of the character 'm' (which breaks if the font is a + // subset that doesn't contain 'm'). + mCode = letterCode = anyCode = -1; + for (code = 0; code < 256; ++code) { + name = ((Gfx8BitFont *)gfxFont)->getCharName(code); + if (name && name[0] == 'm' && name[1] == '\0') { + mCode = code; + } + if (letterCode < 0 && name && name[1] == '\0' && + ((name[0] >= 'A' && name[0] <= 'Z') || + (name[0] >= 'a' && name[0] <= 'z'))) { + letterCode = code; + } + if (anyCode < 0 && name && + ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { + anyCode = code; + } + } + if (mCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { + // 0.6 is a generic average 'm' width -- yes, this is a hack + fontSize *= w / 0.6; + } else if (letterCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { + // even more of a hack: 0.5 is a generic letter width + fontSize *= w / 0.5; + } else if (anyCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { + // better than nothing: 0.5 is a generic character width + fontSize *= w / 0.5; + } + fm = gfxFont->getFontMatrix(); + if (fm[0] != 0) { + fontSize *= fabs(fm[3] / fm[0]); + } + } +} + +void TextPage::beginWord(GfxState *state, double x0, double y0) { + // This check is needed because Type 3 characters can contain + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). 
+ if (curWord) { + ++nest; + return; + } + + curWord = new TextWord(state, x0, y0, charPos, font, fontSize); +} + +void TextPage::addChar(GfxState *state, double x, double y, + double dx, double dy, + CharCode c, Unicode *u, int uLen) { + double x1, y1, w1, h1, dx2, dy2, sp; + int n, i; + + // if the previous char was a space, addChar will have called + // endWord, so we need to start a new word + if (!curWord) { + beginWord(state, x, y); + } + + // throw away chars that aren't inside the page bounds + state->transform(x, y, &x1, &y1); + if (x1 < 0 || x1 > pageWidth || + y1 < 0 || y1 > pageHeight) { + return; + } + + // subtract char and word spacing from the dx,dy values + sp = state->getCharSpace(); + if (c == (CharCode)0x20) { + sp += state->getWordSpace(); + } + state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); + dx -= dx2; + dy -= dy2; + state->transformDelta(dx, dy, &w1, &h1); + + // check the tiny chars limit + if (!globalParams->getTextKeepTinyChars() && + fabs(w1) < 3 && fabs(h1) < 3) { + if (++nTinyChars > 20000) { + return; + } + } + + // break words at space character + if (uLen == 1 && u[0] == (Unicode)0x20) { + ++curWord->charLen; + ++charPos; + endWord(); + return; + } + + // large char spacing is sometimes used to move text around -- in + // this case, break text into individual chars and let the coalesce + // function deal with it later + n = curWord->len; + if (n > 0 && x1 - curWord->xRight[n-1] > + curWord->font->minSpaceWidth * curWord->fontSize) { + endWord(); + beginWord(state, x, y); + } + + // page rotation and/or transform matrices can cause text to be + // drawn in reverse order -- in this case, swap the begin/end + // coordinates and break text into individual chars + if (w1 < 0) { + endWord(); + beginWord(state, x + dx, y + dy); + x1 += w1; + y1 += h1; + w1 = -w1; + h1 = -h1; + } + + // add the characters to the current word + if (uLen != 0) { + w1 /= uLen; + h1 /= uLen; + } + for (i = 0; i < uLen; ++i) { + 
curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); + } + ++curWord->charLen; + ++charPos; +} + +void TextPage::endWord() { + // This check is needed because Type 3 characters can contain + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). + if (nest > 0) { + --nest; + return; + } + + if (curWord) { + addWord(curWord); + curWord = NULL; + } +} + +void TextPage::addWord(TextWord *word) { + TextWord *p1, *p2; + + // throw away zero-length words -- they don't have valid xMin/xMax + // values, and they're useless anyway + if (word->len == 0) { + delete word; + return; + } + + // insert word in xy list + if (rawOrder) { + p1 = wordPtr; + p2 = NULL; + } else { + if (wordPtr && wordPtr->xyBefore(word)) { + p1 = wordPtr; + p2 = wordPtr->next; + } else { + p1 = NULL; + p2 = words; + } + for (; p2; p1 = p2, p2 = p2->next) { + if (word->xyBefore(p2)) { + break; + } + } + } + if (p1) { + p1->next = word; + } else { + words = word; + } + word->next = p2; + wordPtr = word; +} + +void TextPage::coalesce(GBool physLayout) { + TextWord *word0, *word1, *word2; + TextLine *line0, *line1, *line2, *line3, *line4, *lineList; + TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6; + TextBlock *yxBlocks, *blocks, *blkStack; + TextFlow *flow0, *flow1; + double sz, xLimit, yLimit; + double fit1, fit2, sp1, sp2; + GBool found; + UnicodeMap *uMap; + GBool isUnicode; + char buf[8]; + int col1, col2, d, i, j; + +#if 0 // for debugging + printf("*** initial word list ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif + + //----- discard duplicated text (fake boldface, drop shadows) + + word0 = words; + while (word0) { + sz = word0->fontSize; + xLimit = 
word0->xMin + sz * dupMaxDeltaX; + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->len == word0->len && + !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) && + fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY && + fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY && + fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) { + found = gTrue; + break; + } + } + if (found) { + word1->next = word2->next; + delete word2; + } else { + word0 = word0->next; + } + } + +#if 0 // for debugging + printf("*** words after removing duplicate text ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif + + //----- merge words + + word0 = words; + while (word0) { + sz = word0->fontSize; + + // look for adjacent text which is part of the same word, and + // merge it into this word + xLimit = word0->xMax + sz * word0->font->minSpaceWidth; + if (rawOrder) { + word1 = word0; + word2 = word0->next; + found = word2 && + word2->xMin < xLimit && + word2->font == word0->font && + fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05 && + word2->charPos == word0->charPos + word0->charLen; + } else { + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->font == word0->font && + fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05 && + word2->charPos == word0->charPos + word0->charLen) { + found = gTrue; + break; + } + } + } + if (found) { + word0->merge(word2); + word1->next = word2->next; + delete word2; + continue; + } + + word0 = word0->next; + } + +#if 0 // for 
debugging + printf("*** after merging words ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif + + //----- assemble words into lines + + lineList = line0 = NULL; + while (words) { + + // remove the first word from the word list + word0 = words; + words = words->next; + word0->next = NULL; + + // find the best line (if any) for the word + if (rawOrder) { + if (line0 && lineFit(line0, word0, &sp2) >= 0) { + line1 = line0; + sp1 = sp2; + } else { + line1 = NULL; + sp1 = 0; + } + } else { + line1 = NULL; + fit1 = 0; + sp1 = 0; + for (line2 = lineList; line2; line2 = line2->next) { + fit2 = lineFit(line2, word0, &sp2); + if (fit2 >= 0 && (!line1 || fit2 < fit1)) { + line1 = line2; + fit1 = fit2; + sp1 = sp2; + } + } + } + + // found a line: append the word + if (line1) { + word1 = line1->lastWord; + word1->next = word0; + line1->lastWord = word0; + if (word0->xMax > line1->xMax) { + line1->xMax = word0->xMax; + } + if (word0->yMin < line1->yMin) { + line1->yMin = word0->yMin; + } + if (word0->yMax > line1->yMax) { + line1->yMax = word0->yMax; + } + line1->len += word0->len; + if (sp1 > line1->fontSize * line1->font->minSpaceWidth) { + word1->spaceAfter = gTrue; + ++line1->len; + } + + // didn't find a line: create a new line + } else { + line1 = new TextLine(); + line1->words = line1->lastWord = word0; + line1->xMin = word0->xMin; + line1->xMax = word0->xMax; + line1->yMin = word0->yMin; + line1->yMax = word0->yMax; + line1->yBase = word0->yBase; + line1->font = word0->font; + line1->fontSize = word0->fontSize; + line1->len = word0->len; + if (line0) { + line0->next = line1; + } else { + lineList = line1; + } + line0 = line1; + } + } + + // build the line text + uMap = 
globalParams->getTextEncoding(); + isUnicode = uMap ? uMap->isUnicode() : gFalse; + + for (line1 = lineList; line1; line1 = line1->next) { + line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode)); + line1->xRight = (double *)gmalloc(line1->len * sizeof(double)); + line1->col = (int *)gmalloc(line1->len * sizeof(int)); + i = 0; + for (word1 = line1->words; word1; word1 = word1->next) { + for (j = 0; j < word1->len; ++j) { + line1->text[i] = word1->text[j]; + line1->xRight[i] = word1->xRight[j]; + ++i; + } + if (word1->spaceAfter && word1->next) { + line1->text[i] = (Unicode)0x0020; + line1->xRight[i] = word1->next->xMin; + ++i; + } + } + line1->convertedLen = 0; + for (j = 0; j < line1->len; ++j) { + line1->col[j] = line1->convertedLen; + if (isUnicode) { + ++line1->convertedLen; + } else if (uMap) { + line1->convertedLen += + uMap->mapUnicode(line1->text[j], buf, sizeof(buf)); + } + } + + // check for hyphen at end of line + //~ need to check for other chars used as hyphens + if (line1->text[line1->len - 1] == (Unicode)'-') { + line1->hyphenated = gTrue; + } + + } + + if (uMap) { + uMap->decRefCnt(); + } + +#if 0 // for debugging + printf("*** lines in xy order ***\n"); + for (line0 = lineList; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- column assignment + + for (line1 = lineList; line1; line1 = line1->next) { + col1 = 0; + for (line2 = lineList; line2 != line1; line2 = line2->next) { + if (line1->xMin >= line2->xMax) { + d = 
(int)((line1->xMin - line2->xMax) / + (line1->font->maxSpaceWidth * line1->fontSize)); + if (d > 4) { + d = 4; + } + col2 = line2->col[0] + line2->convertedLen + d; + if (col2 > col1) { + col1 = col2; + } + } else if (line1->xMin > line2->xMin) { + for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ; + col2 = line2->col[i]; + if (col2 > col1) { + col1 = col2; + } + } + } + for (j = 0; j < line1->len; ++j) { + line1->col[j] += col1; + } + } + +#if 0 // for debugging + printf("*** lines in xy order, after column assignment ***\n"); + for (line0 = lineList; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->col[0], line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- assemble lines into blocks + + if (rawOrder) { + + lines = lineList; + for (line1 = lines; line1; line1 = line1->next) { + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + } + + } else { + + // sort lines into yx order + lines = NULL; + while (lineList) { + line0 = lineList; + lineList = lineList->next; + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->next) ; + if (line1) { + line1->next = line0; + } else { + lines = line0; + } + line0->next = line2; + } + + // compute whitespace to left and right of each line + line0 = lines; + for (line1 = lines; line1; line1 = line1->next) { + + // find the first vertically overlapping line + for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ; + + // check each vertically overlapping line -- look 
for the nearest + // on each side + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + for (line2 = line0; + line2 && line2->yMin < line1->yMax; + line2 = line2->next) { + if (line2->yMax > line1->yMin) { + if (line2->xMax < line1->xMin) { + if (line2->xMax > line1->xSpaceL) { + line1->xSpaceL = line2->xMax; + } + } else if (line2->xMin > line1->xMax) { + if (line2->xMin < line1->xSpaceR) { + line1->xSpaceR = line2->xMin; + } + } + } + } + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + for (line0 = lines; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + lineList = lines; + yxBlocks = NULL; + blk0 = NULL; + while (lineList) { + + // build a new block object + line0 = lineList; + lineList = lineList->next; + line0->next = NULL; + blk1 = new TextBlock(); + blk1->lines = line0; + blk1->xMin = line0->xMin; + blk1->xMax = line0->xMax; + blk1->yMin = line0->yMin; + blk1->yMax = line0->yMax; + blk1->xSpaceL = line0->xSpaceL; + blk1->xSpaceR = line0->xSpaceR; + blk1->maxFontSize = line0->fontSize; + + // find subsequent lines in the block + while (lineList) { + + // look for the first horizontally overlapping line below this + // one + yLimit = line0->yMax + blkMaxSpacing * line0->fontSize; + line3 = line4 = NULL; + if (rawOrder) { + if (lineList->yMin < yLimit && + lineList->xMax > blk1->xMin && + lineList->xMin < blk1->xMax) { + line3 = NULL; + line4 = 
lineList; + } + } else { + for (line1 = NULL, line2 = lineList; + line2 && line2->yMin < yLimit; + line1 = line2, line2 = line2->next) { + if (line2->xMax > blk1->xMin && + line2->xMin < blk1->xMax) { + line3 = line1; + line4 = line2; + break; + } + } + } + + // if there is an overlapping line and it fits in the block, add + // it to the block + if (line4 && blockFit(blk1, line4)) { + if (line3) { + line3->next = line4->next; + } else { + lineList = line4->next; + } + line0->next = line0->flowNext = line4; + line4->next = NULL; + if (line4->xMin < blk1->xMin) { + blk1->xMin = line4->xMin; + } else if (line4->xMax > blk1->xMax) { + blk1->xMax = line4->xMax; + } + if (line4->yMax > blk1->yMax) { + blk1->yMax = line4->yMax; + } + if (line4->xSpaceL > blk1->xSpaceL) { + blk1->xSpaceL = line4->xSpaceL; + } + if (line4->xSpaceR < blk1->xSpaceR) { + blk1->xSpaceR = line4->xSpaceR; + } + if (line4->fontSize > blk1->maxFontSize) { + blk1->maxFontSize = line4->fontSize; + } + line0 = line4; + + // otherwise, we're done with this block + } else { + break; + } + } + + // insert block on list, in yx order + if (rawOrder) { + blk2 = blk0; + blk3 = NULL; + blk0 = blk1; + } else { + for (blk2 = NULL, blk3 = yxBlocks; + blk3 && !blk1->yxBefore(blk3); + blk2 = blk3, blk3 = blk3->next) ; + } + blk1->next = blk3; + if (blk2) { + blk2->next = blk1; + } else { + yxBlocks = blk1; + } + } + +#if 0 // for debugging + printf("*** blocks in yx order ***\n"); + for (blk0 = yxBlocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + 
word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); + fflush(stdout); +#endif + + //----- merge lines and blocks, sort blocks into reading order -//------------------------------------------------------------------------ -// Character substitutions -//------------------------------------------------------------------------ + if (rawOrder) { + blocks = yxBlocks; -static char *isoLatin1Subst[] = { - "L", // Lslash - "OE", // OE - "S", // Scaron - "Y", // Ydieresis - "Z", // Zcaron - "fi", "fl", // ligatures - "ff", "ffi", "ffl", // ligatures - "i", // dotlessi - "l", // lslash - "oe", // oe - "s", // scaron - "z", // zcaron - "*", // bullet - "...", // ellipsis - "-", "-", // emdash, hyphen - "\"", "\"", // quotedblleft, quotedblright - "'", // quotesingle - "TM" // trademark -}; - -static char *ascii7Subst[] = { - "A", "A", "A", "A", // A{acute,circumflex,dieresis,grave} - "A", "A", // A{ring,tilde} - "AE", // AE - "C", // Ccedilla - "E", "E", "E", "E", // E{acute,circumflex,dieresis,grave} - "I", "I", "I", "I", // I{acute,circumflex,dieresis,grave} - "L", // Lslash - "N", // Ntilde - "O", "O", "O", "O", // O{acute,circumflex,dieresis,grave} - "O", "O", // O{slash,tilde} - "OE", // OE - "S", // Scaron - "U", "U", "U", "U", // U{acute,circumflex,dieresis,grave} - "Y", "Y", // T{acute,dieresis} - "Z", // Zcaron - "a", "a", "a", "a", // a{acute,circumflex,dieresis,grave} - "a", "a", // a{ring,tilde} - "ae", // ae - "c", // ccedilla - "e", "e", "e", "e", // e{acute,circumflex,dieresis,grave} - "fi", "fl", // ligatures - "ff", "ffi", "ffl", // ligatures - "i", // dotlessi - "i", "i", "i", "i", // i{acute,circumflex,dieresis,grave} - "l", // lslash - "n", // ntilde - "o", "o", "o", "o", // o{acute,circumflex,dieresis,grave} - "o", "o", // o{slash,tilde} - "oe", // oe - "s", // scaron - "u", "u", "u", "u", // u{acute,circumflex,dieresis,grave} - "y", "y", // 
t{acute,dieresis} - "z", // zcaron - "|", // brokenbar - "*", // bullet - "...", // ellipsis - "-", "-", "-", // emdash, endash, hyphen - "\"", "\"", // quotedblleft, quotedblright - "'", // quotesingle - "(R)", // registered - "TM" // trademark -}; + } else { + blocks = NULL; + blk0 = NULL; + blkStack = NULL; + while (yxBlocks) { + + // find the next two blocks: + // - if the depth-first traversal stack is empty, take the first + // (upper-left-most) two blocks on the yx-sorted block list + // - otherwise, find the two upper-left-most blocks under the top + // block on the stack + if (blkStack) { + blk3 = blk4 = blk5 = blk6 = NULL; + for (blk1 = NULL, blk2 = yxBlocks; + blk2; + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin > blkStack->yMin && + blk2->xMax > blkStack->xMin && + blk2->xMin < blkStack->xMax) { + if (!blk4 || blk2->yxBefore(blk4)) { + blk5 = blk3; + blk6 = blk4; + blk3 = blk1; + blk4 = blk2; + } else if (!blk6 || blk2->yxBefore(blk6)) { + blk5 = blk1; + blk6 = blk2; + } + } + } + } else { + blk3 = NULL; + blk4 = yxBlocks; + blk5 = yxBlocks; + blk6 = yxBlocks->next; + } -//------------------------------------------------------------------------ -// 16-bit fonts -//------------------------------------------------------------------------ + // merge case 1: + // | | | + // | blkStack | | blkStack + // +---------------------+ --> +-------------- + // +------+ +------+ +-----------+ + // | blk4 | | blk6 | ... 
| blk4+blk6 | + // +------+ +------+ +-----------+ + yLimit = 0; // make gcc happy + if (blkStack) { + yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize; + } + if (blkStack && blk4 && blk6 && + !blk4->lines->next && !blk6->lines->next && + lineFit2(blk4->lines, blk6->lines) && + blk4->yMin < yLimit && + blk4->xMin > blkStack->xSpaceL && + blkStack->xMin > blk4->xSpaceL && + blk6->xMax < blkStack->xSpaceR) { + blk4->mergeRight(blk6); + if (blk5) { + blk5->next = blk6->next; + } else { + yxBlocks = blk6->next; + } + delete blk6; + + // merge case 2: + // | | | | + // | blkStack | | | + // +---------------------+ --> | blkStack+blk2 | + // +---------------------+ | | + // | blk4 | | | + // | | | | + } else if (blkStack && blk4 && + blk4->yMin < yLimit && + blockFit2(blkStack, blk4)) { + blkStack->mergeBelow(blk4); + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + delete blk4; + + // if any of: + // 1. no block found + // 2. non-fully overlapping block found + // 3. large vertical gap above the overlapping block + // then pop the stack and try again + } else if (!blk4 || + (blkStack && (blk4->xMin < blkStack->xSpaceL || + blk4->xMax > blkStack->xSpaceR || + blk4->yMin - blkStack->yMax > + blkMaxSortSpacing * blkStack->maxFontSize))) { + blkStack = blkStack->stackNext; + + // add a block to the sorted list + } else { -#if JAPANESE_SUPPORT - -// CID 0 .. 96 -static Gushort japan12Map[96] = { - 0x2120, 0x2120, 0x212a, 0x2149, 0x2174, 0x2170, 0x2173, 0x2175, // 00 .. 07 - 0x2147, 0x214a, 0x214b, 0x2176, 0x215c, 0x2124, 0x213e, 0x2123, // 08 .. 0f - 0x213f, 0x2330, 0x2331, 0x2332, 0x2333, 0x2334, 0x2335, 0x2336, // 10 .. 17 - 0x2337, 0x2338, 0x2339, 0x2127, 0x2128, 0x2163, 0x2161, 0x2164, // 18 .. 1f - 0x2129, 0x2177, 0x2341, 0x2342, 0x2343, 0x2344, 0x2345, 0x2346, // 20 .. 27 - 0x2347, 0x2348, 0x2349, 0x234a, 0x234b, 0x234c, 0x234d, 0x234e, // 28 .. 
2f - 0x234f, 0x2350, 0x2351, 0x2352, 0x2353, 0x2354, 0x2355, 0x2356, // 30 .. 37 - 0x2357, 0x2358, 0x2359, 0x235a, 0x214e, 0x216f, 0x214f, 0x2130, // 38 .. 3f - 0x2132, 0x2146, 0x2361, 0x2362, 0x2363, 0x2364, 0x2365, 0x2366, // 40 .. 47 - 0x2367, 0x2368, 0x2369, 0x236a, 0x236b, 0x236c, 0x236d, 0x236e, // 48 .. 4f - 0x236f, 0x2370, 0x2371, 0x2372, 0x2373, 0x2374, 0x2375, 0x2376, // 50 .. 57 - 0x2377, 0x2378, 0x2379, 0x237a, 0x2150, 0x2143, 0x2151, 0x2141 // 58 .. 5f -}; - -// CID 325 .. 421 -static Gushort japan12KanaMap1[97] = { - 0x2131, 0x2121, 0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572, - 0x2521, 0x2523, 0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567, - 0x2543, 0x213c, 0x2522, 0x2524, 0x2526, 0x2528, 0x252a, 0x252b, - 0x252d, 0x252f, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253b, - 0x253d, 0x253f, 0x2541, 0x2544, 0x2546, 0x2548, 0x254a, 0x254b, - 0x254c, 0x254d, 0x254e, 0x254f, 0x2552, 0x2555, 0x2558, 0x255b, - 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568, - 0x2569, 0x256a, 0x256b, 0x256c, 0x256d, 0x256f, 0x2573, 0x212b, - 0x212c, 0x212e, 0x2570, 0x2571, 0x256e, 0x2575, 0x2576, 0x2574, - 0x252c, 0x252e, 0x2530, 0x2532, 0x2534, 0x2536, 0x2538, 0x253a, - 0x253c, 0x253e, 0x2540, 0x2542, 0x2545, 0x2547, 0x2549, 0x2550, - 0x2551, 0x2553, 0x2554, 0x2556, 0x2557, 0x2559, 0x255a, 0x255c, - 0x255d -}; - -// CID 501 .. 
598 -static Gushort japan12KanaMap2[98] = { - 0x212d, 0x212f, 0x216d, 0x214c, 0x214d, 0x2152, 0x2153, 0x2154, - 0x2155, 0x2158, 0x2159, 0x215a, 0x215b, 0x213d, 0x2121, 0x2472, - 0x2421, 0x2423, 0x2425, 0x2427, 0x2429, 0x2463, 0x2465, 0x2467, - 0x2443, 0x2422, 0x2424, 0x2426, 0x2428, 0x242a, 0x242b, 0x242d, - 0x242f, 0x2431, 0x2433, 0x2435, 0x2437, 0x2439, 0x243b, 0x243d, - 0x243f, 0x2441, 0x2444, 0x2446, 0x2448, 0x244a, 0x244b, 0x244c, - 0x244d, 0x244e, 0x244f, 0x2452, 0x2455, 0x2458, 0x245b, 0x245e, - 0x245f, 0x2460, 0x2461, 0x2462, 0x2464, 0x2466, 0x2468, 0x2469, - 0x246a, 0x246b, 0x246c, 0x246d, 0x246f, 0x2473, 0x2470, 0x2471, - 0x246e, 0x242c, 0x242e, 0x2430, 0x2432, 0x2434, 0x2436, 0x2438, - 0x243a, 0x243c, 0x243e, 0x2440, 0x2442, 0x2445, 0x2447, 0x2449, - 0x2450, 0x2451, 0x2453, 0x2454, 0x2456, 0x2457, 0x2459, 0x245a, - 0x245c, 0x245d -}; - -static char *japan12Roman[10] = { - "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X" -}; - -static char *japan12Abbrev1[6] = { - "mm", "cm", "km", "mg", "kg", "cc" -}; + // remove the block from the yx-sorted list + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + blk4->next = NULL; + + // append the block to the reading-order list + if (blk0) { + blk0->next = blk4; + } else { + blocks = blk4; + } + blk0 = blk4; + // push the block on the traversal stack + if (!physLayout) { + blk4->stackNext = blkStack; + blkStack = blk4; + } + } + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** blocks in reading order (after merging) ***\n"); + for (blk0 = blocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f 
y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); + fflush(stdout); #endif -//------------------------------------------------------------------------ -// TextString -//------------------------------------------------------------------------ + //----- assemble blocks into flows -TextString::TextString(GfxState *state, GBool hexCodes1) { - double x, y, h; + if (rawOrder) { - state->transform(state->getCurX(), state->getCurY(), &x, &y); - h = state->getTransformedFontSize(); - //~ yMin/yMax computation should use font ascent/descent values - yMin = y - 0.95 * h; - yMax = yMin + 1.3 * h; - col = 0; - text = new GString(); - xRight = NULL; - yxNext = NULL; - xyNext = NULL; - hexCodes = hexCodes1; -} + // one flow per block + flow0 = NULL; + while (blocks) { + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + blocks = blocks->next; + flow1->blocks->next = NULL; + if (flow0) { + flow0->next = flow1; + } else { + flows = flow1; + } + flow0 = flow1; + } -TextString::~TextString() { - delete text; - gfree(xRight); -} + } else { -void TextString::addChar(GfxState *state, double x, double y, - double dx, double dy, - Guchar c, GBool useASCII7) { - char *charName, *sub; - int c1; - int i, j, n, m; - - // get current index - i = text->getLength(); - - // append translated character(s) to string - sub = NULL; - n = 1; - if ((charName = state->getFont()->getCharName(c))) { - if (useASCII7) - c1 = ascii7Encoding.getCharCode(charName); - else - c1 = isoLatin1Encoding.getCharCode(charName); - if (c1 < 0) { - m = strlen(charName); - if (hexCodes && m == 3 && - (charName[0] == 'B' || charName[0] == 'C' || - charName[0] == 'G') && - isxdigit(charName[1]) && isxdigit(charName[2])) { - 
sscanf(charName+1, "%x", &c1); - } else if (!hexCodes && m >= 2 && m <= 3 && - isdigit(charName[0]) && isdigit(charName[1])) { - c1 = atoi(charName); - if (c1 >= 256) - c1 = -1; - } else if (!hexCodes && m >= 3 && m <= 5 && isdigit(charName[1])) { - c1 = atoi(charName+1); - if (c1 >= 256) - c1 = -1; - } - //~ this is a kludge -- is there a standard internal encoding - //~ used by all/most Type 1 fonts? - if (c1 == 262) // hyphen - c1 = 45; - else if (c1 == 266) // emdash - c1 = 208; - if (useASCII7) - c1 = ascii7Encoding.getCharCode(isoLatin1Encoding.getCharName(c1)); - } - if (useASCII7) { - if (c1 >= 128) { - sub = ascii7Subst[c1 - 128]; - n = strlen(sub); - } - } else { - if (c1 >= 256) { - sub = isoLatin1Subst[c1 - 256]; - n = strlen(sub); + // compute whitespace above and below each block + for (blk0 = blocks; blk0; blk0 = blk0->next) { + blk0->ySpaceT = 0; + blk0->ySpaceB = pageHeight; + + // check each horizontally overlapping block + for (blk1 = blocks; blk1; blk1 = blk1->next) { + if (blk1 != blk0 && + blk1->xMin < blk0->xMax && + blk1->xMax > blk0->xMin) { + if (blk1->yMax < blk0->yMin) { + if (blk1->yMax > blk0->ySpaceT) { + blk0->ySpaceT = blk1->yMax; + } + } else if (blk1->yMin > blk0->yMax) { + if (blk1->yMin < blk0->ySpaceB) { + blk0->ySpaceB = blk1->yMin; + } + } + } } } - } else { - c1 = -1; - } - if (sub) - text->append(sub); - else if (c1 >= 0) - text->append((char)c1); - else - text->append(' '); - - // update position information - if (i+n > ((i+15) & ~15)) - xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double)); - if (i == 0) - xMin = x; - for (j = 0; j < n; ++j) - xRight[i+j] = x + ((j+1) * dx) / n; - xMax = x + dx; -} - -void TextString::addChar16(GfxState *state, double x, double y, - double dx, double dy, - int c, GfxFontCharSet16 charSet) { - int c1, t1, t2; - int sub[8]; - char *p; - int *q; - int i, j, n; - - // get current index - i = text->getLength(); - - // convert the 16-bit character - c1 = 0; - sub[0] = 0; - 
switch (charSet) { - - // convert Adobe-Japan1-2 to JIS X 0208-1983 - case font16AdobeJapan12: -#if JAPANESE_SUPPORT - if (c <= 96) { - c1 = 0x8080 + japan12Map[c]; - } else if (c <= 632) { - if (c <= 230) - c1 = 0; - else if (c <= 324) - c1 = 0x8080 + japan12Map[c - 230]; - else if (c <= 421) - c1 = 0x8080 + japan12KanaMap1[c - 325]; - else if (c <= 500) - c1 = 0; - else if (c <= 598) - c1 = 0x8080 + japan12KanaMap2[c - 501]; - else - c1 = 0; - } else if (c <= 1124) { - if (c <= 779) { - if (c <= 726) - c1 = 0xa1a1 + (c - 633); - else if (c <= 740) - c1 = 0xa2a1 + (c - 727); - else if (c <= 748) - c1 = 0xa2ba + (c - 741); - else if (c <= 755) - c1 = 0xa2ca + (c - 749); - else if (c <= 770) - c1 = 0xa2dc + (c - 756); - else if (c <= 778) - c1 = 0xa2f2 + (c - 771); - else - c1 = 0xa2fe; - } else if (c <= 841) { - if (c <= 789) - c1 = 0xa3b0 + (c - 780); - else if (c <= 815) - c1 = 0xa3c1 + (c - 790); - else - c1 = 0xa3e1 + (c - 816); - } else if (c <= 1010) { - if (c <= 924) - c1 = 0xa4a1 + (c - 842); - else - c1 = 0xa5a1 + (c - 925); - } else { - if (c <= 1034) - c1 = 0xa6a1 + (c - 1011); - else if (c <= 1058) - c1 = 0xa6c1 + (c - 1035); - else if (c <= 1091) - c1 = 0xa7a1 + (c - 1059); - else - c1 = 0xa7d1 + (c - 1092); - } - } else if (c <= 4089) { - t1 = (c - 1125) / 94; - t2 = (c - 1125) % 94; - c1 = 0xb0a1 + (t1 << 8) + t2; - } else if (c <= 7477) { - t1 = (c - 4090) / 94; - t2 = (c - 4090) % 94; - c1 = 0xd0a1 + (t1 << 8) + t2; - } else if (c <= 7554) { - c1 = 0; - } else if (c <= 7563) { // circled Arabic numbers 1..9 - c1 = 0xa3b1 + (c - 7555); - } else if (c <= 7574) { // circled Arabic numbers 10..20 - t1 = c - 7564 + 10; - sub[0] = 0xa3b0 + (t1 / 10); - sub[1] = 0xa3b0 + (t1 % 10); - sub[2] = 0; - c1 = -1; - } else if (c <= 7584) { // Roman numbers I..X - for (p = japan12Roman[c - 7575], q = sub; *p; ++p, ++q) { - *q = 0xa380 + *p; - } - *q = 0; - c1 = -1; - } else if (c <= 7632) { - if (c <= 7600) { - c1 = 0; - } else if (c <= 7606) { - for (p = 
japan12Abbrev1[c - 7601], q = sub; *p; ++p, ++q) { - *q = 0xa380 + *p; - } - *q = 0; - c1 = -1; + + flow0 = NULL; + while (blocks) { + + // build a new flow object + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + flow1->ySpaceT = blocks->ySpaceT; + flow1->ySpaceB = blocks->ySpaceB; + + // find subsequent blocks in the flow + for (blk1 = blocks, blk2 = blocks->next; + blk2 && flowFit(flow1, blk2); + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin < flow1->yMin) { + flow1->yMin = blk2->yMin; + } + if (blk2->yMax > flow1->yMax) { + flow1->yMax = blk2->yMax; + } + if (blk2->ySpaceT > flow1->ySpaceT) { + flow1->ySpaceT = blk2->ySpaceT; + } + if (blk2->ySpaceB < flow1->ySpaceB) { + flow1->ySpaceB = blk2->ySpaceB; + } + for (line1 = blk1->lines; line1->next; line1 = line1->next) ; + line1->flowNext = blk2->lines; + } + + // chop the block list + blocks = blk1->next; + blk1->next = NULL; + + // append the flow to the list + if (flow0) { + flow0->next = flow1; } else { - c1 = 0; + flows = flow1; } - } else { - c1 = 0; + flow0 = flow1; } -#endif // JAPANESE_SUPPORT - break; } - // append converted character to string - if (c1 == 0) { -#if 0 //~ - error(-1, "Unsupported Adobe-Japan1-2 character: %d", c); +#if 0 // for debugging + printf("*** flows ***\n"); + for (flow0 = flows; flow0; flow0 = flow0->next) { + printf("[flow]\n"); + for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) { + printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax, + blk0->ySpaceT, blk0->ySpaceB); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + 
word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + } + printf("\n"); + fflush(stdout); #endif - text->append(' '); - n = 1; - } else if (c1 > 0) { - text->append(c1 >> 8); - text->append(c1 & 0xff); - n = 2; + + //----- sort lines into yx order + + // (the block/line merging process doesn't maintain the full-page + // linked list of lines) + + lines = NULL; + if (rawOrder) { + line0 = NULL; + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line1 = flow0->lines; line1; line1 = line1->flowNext) { + if (line0) { + line0->pageNext = line1; + } else { + lines = line1; + } + line0 = line1; + } + } } else { - n = 0; - for (q = sub; *q; ++q) { - text->append(*q >> 8); - text->append(*q & 0xff); - n += 2; + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line0 = flow0->lines; line0; line0 = line0->flowNext) { + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->pageNext) ; + if (line1) { + line1->pageNext = line0; + } else { + lines = line0; + } + line0->pageNext = line2; + } } } - // update position information - if (i+n > ((i+15) & ~15)) { - xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double)); - } - if (i == 0) { - xMin = x; - } - for (j = 0; j < n; ++j) { - xRight[i+j] = x + dx; +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + for (line0 = lines; line0; line0 = line0->pageNext) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0], + line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + 
for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } } - xMax = x + dx; + printf("\n"); + fflush(stdout); +#endif } -//------------------------------------------------------------------------ -// TextPage -//------------------------------------------------------------------------ +// If can be added the end of , return the absolute value +// of the difference between 's baseline and 's baseline, +// and set * to the horizontal space between the current last +// word in and . A smaller return value indicates a +// better fit. Otherwise, return a negative number. +double TextPage::lineFit(TextLine *line, TextWord *word, double *space) { + TextWord *lastWord; + double fontSize0, fontSize1; + double dx, dxLimit; + + lastWord = line->lastWord; + fontSize0 = line->fontSize; + fontSize1 = word->fontSize; + dx = word->xMin - lastWord->xMax; + dxLimit = fontSize0 * lastWord->font->maxSpaceWidth; + + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX || + dx > dxLimit) { + return -1; + } -TextPage::TextPage(GBool useASCII7, GBool rawOrder) { - this->useASCII7 = useASCII7; - this->rawOrder = rawOrder; - curStr = NULL; - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; -} + if ( + // look for adjacent words with close baselines and close font sizes + (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) || + + // look for a superscript + (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 && + (word->yMax < lastWord->yMax || + word->yBase < lastWord->yBase) && + word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSuperscriptDeltaX) || + + // look for a subscript + (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 && + 
(word->yMin > lastWord->yMin || + word->yBase > lastWord->yBase) && + line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSubscriptDeltaX)) { + + *space = dx; + return fabs(word->yBase - line->yBase); + } -TextPage::~TextPage() { - clear(); + return -1; } -void TextPage::beginString(GfxState *state, GString *s, GBool hexCodes) { - curStr = new TextString(state, hexCodes); -} +// Returns true if and can be merged into a single +// line, ignoring max word spacing. +GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) { + double fontSize0, fontSize1; + double dx; -void TextPage::addChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c) { - double x1, y1, w1, h1, dx2, dy2; - int n; - GBool hexCodes; + fontSize0 = line0->fontSize; + fontSize1 = line1->fontSize; + dx = line1->xMin - line0->xMax; - state->transform(x, y, &x1, &y1); - state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2); - dx -= dx2; - dy -= dy2; - state->transformDelta(dx, dy, &w1, &h1); - n = curStr->text->getLength(); - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - hexCodes = curStr->hexCodes; - endString(); - beginString(state, NULL, hexCodes); + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX) { + return gFalse; } - curStr->addChar(state, x1, y1, w1, h1, c, useASCII7); -} -void TextPage::addChar16(GfxState *state, double x, double y, - double dx, double dy, int c, - GfxFontCharSet16 charSet) { - double x1, y1, w1, h1, dx2, dy2; - int n; - GBool hexCodes; - - state->transform(x, y, &x1, &y1); - state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2); - dx -= dx2; - dy -= dy2; - state->transformDelta(dx, dy, &w1, &h1); - n = curStr->text->getLength(); - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - hexCodes = curStr->hexCodes; - endString(); - beginString(state, NULL, hexCodes); + // look for close baselines and close font sizes + 
if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) { + return gTrue; } - curStr->addChar16(state, x1, y1, w1, h1, c, charSet); + + return gFalse; } -void TextPage::endString() { - TextString *p1, *p2; - double h, y1, y2; +// Returns true if can be added to . Assumes the y +// coordinates are within range. +GBool TextPage::blockFit(TextBlock *blk, TextLine *line) { + double fontSize0, fontSize1; + + // check edges + if (line->xMin < blk->xSpaceL || + line->xMax > blk->xSpaceR || + blk->xMin < line->xSpaceL || + blk->xMax > line->xSpaceR) { + return gFalse; + } - // throw away zero-length strings -- they don't have valid xMin/xMax - // values, and they're useless anyway - if (curStr->text->getLength() == 0) { - delete curStr; - curStr = NULL; - return; + // check font sizes + fontSize0 = blk->lines->fontSize; + fontSize1 = line->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; } -#if 0 //~tmp - if (curStr->yMax - curStr->yMin > 20) { - delete curStr; - curStr = NULL; - return; + return gTrue; +} + +// Returns true if and can be merged into a single +// block. Assumes the y coordinates are within range. 
+GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) { + double fontSize0, fontSize1; + + // check edges + if (blk1->xMin < blk0->xSpaceL || + blk1->xMax > blk0->xSpaceR || + blk0->xMin < blk1->xSpaceL || + blk0->xMax > blk1->xSpaceR) { + return gFalse; } -#endif - // insert string in y-major list - h = curStr->yMax - curStr->yMin; - y1 = curStr->yMin + 0.5 * h; - y2 = curStr->yMin + 0.8 * h; - if (rawOrder) { - p1 = yxCur1; - p2 = NULL; - } else if ((!yxCur1 || - (y1 >= yxCur1->yMin && - (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) && - (!yxCur2 || - (y1 < yxCur2->yMin || - (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) { - p1 = yxCur1; - p2 = yxCur2; - } else { - for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) { - if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) - break; - } - yxCur2 = p2; + // check font sizes + fontSize0 = blk0->lines->fontSize; + fontSize1 = blk1->lines->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; } - yxCur1 = curStr; - if (p1) - p1->yxNext = curStr; - else - yxStrings = curStr; - curStr->yxNext = p2; - curStr = NULL; + + return gTrue; } -void TextPage::coalesce() { - TextString *str1, *str2; - double space, d; - int n, i; +// Returns true if can be added to . 
+GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) { + double dy; -#if 0 //~ for debugging - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - printf("x=%3d..%3d y=%3d..%3d size=%2d '%s'\n", - (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax, - (int)(str1->yMax - str1->yMin), str1->text->getCString()); - } - printf("\n------------------------------------------------------------\n\n"); -#endif - str1 = yxStrings; - while (str1 && (str2 = str1->yxNext)) { - space = str1->yMax - str1->yMin; - d = str2->xMin - str1->xMax; -#if 0 //~tmp - if (((rawOrder && - ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || - (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || - (!rawOrder && str2->yMin < str1->yMax)) && - d > -0.1 * space && d < 0.2 * space) { -#else - if (((rawOrder && - ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || - (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || - (!rawOrder && str2->yMin < str1->yMax)) && - d > -0.5 * space && d < space) { -#endif - n = str1->text->getLength(); - if (d > 0.1 * space) - str1->text->append(' '); - str1->text->append(str2->text); - str1->xRight = (double *) - grealloc(str1->xRight, str1->text->getLength() * sizeof(double)); - if (d > 0.1 * space) - str1->xRight[n++] = str2->xMin; - for (i = 0; i < str2->text->getLength(); ++i) - str1->xRight[n++] = str2->xRight[i]; - if (str2->xMax > str1->xMax) - str1->xMax = str2->xMax; - if (str2->yMax > str1->yMax) - str1->yMax = str2->yMax; - str1->yxNext = str2->yxNext; - delete str2; - } else { - str1 = str2; - } + // check whitespace above and below + if (blk->yMin < flow->ySpaceT || + blk->yMax > flow->ySpaceB || + flow->yMin < blk->ySpaceT || + flow->yMax > blk->ySpaceB) { + return gFalse; } + + // check that block top edge is within +/- dy of flow top edge, + // and that block bottom edge is above flow bottom edge + dy + dy = flowMaxDeltaY * flow->blocks->maxFontSize; + return blk->yMin > flow->yMin - dy && + blk->yMin < 
flow->yMin + dy && + blk->yMax < flow->yMax + dy; } -GBool TextPage::findText(char *s, GBool top, GBool bottom, + +GBool TextPage::findText(Unicode *s, int len, + GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { - TextString *str; - char *p, *p1, *q; - int n, m, i; - double x; + TextLine *line; + Unicode *p; + Unicode u1, u2; + int m, i, j; + double x0, x1, x; - // scan all strings on page - n = strlen(s); - for (str = yxStrings; str; str = str->yxNext) { + // scan all text on the page + for (line = lines; line; line = line->pageNext) { // check: above top limit? - if (!top && (str->yMax < *yMin || - (str->yMin < *yMin && str->xMax <= *xMin))) + if (!top && (line->yMax < *yMin || + (line->yMin < *yMin && line->xMax <= *xMin))) { continue; + } // check: below bottom limit? - if (!bottom && (str->yMin > *yMax || - (str->yMax > *yMax && str->xMin >= *xMax))) + if (!bottom && (line->yMin > *yMax || + (line->yMax > *yMax && line->xMin >= *xMax))) { return gFalse; + } + + // search each position in this line + m = line->len; + for (i = 0, p = line->text; i <= m - len; ++i, ++p) { - // search each position in this string - m = str->text->getLength(); - for (i = 0, p = str->text->getCString(); i <= m - n; ++i, ++p) { + x0 = (i == 0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + x = 0.5 * (x0 + x1); // check: above top limit? - if (!top && str->yMin < *yMin) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x < *xMin) + if (!top && line->yMin < *yMin) { + if (x < *xMin) { continue; + } } // check: below bottom limit? - if (!bottom && str->yMax > *yMax) { - x = (((i == 0) ? 
str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x > *xMax) + if (!bottom && line->yMax > *yMax) { + if (x > *xMax) { return gFalse; + } } // compare the strings - for (p1 = p, q = s; *q; ++p1, ++q) { - if (tolower(*p1) != tolower(*q)) + for (j = 0; j < len; ++j) { +#if 1 //~ this lowercases Latin A-Z only -- this will eventually be + //~ extended to handle other character sets + if (p[j] >= 0x41 && p[j] <= 0x5a) { + u1 = p[j] + 0x20; + } else { + u1 = p[j]; + } + if (s[j] >= 0x41 && s[j] <= 0x5a) { + u2 = s[j] + 0x20; + } else { + u2 = s[j]; + } +#endif + if (u1 != u2) { break; + } } // found it - if (!*q) { - *xMin = (i == 0) ? str->xMin : str->xRight[i-1]; - *xMax = str->xRight[i+n-1]; - *yMin = str->yMin; - *yMax = str->yMax; + if (j == len) { + *xMin = x0; + *xMax = line->xRight[i + len - 1]; + *yMin = line->yMin; + *yMax = line->yMax; return gTrue; } } } + return gFalse; } GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; - TextString *str1; - double x0, x1, x2, y; - double xPrev, yPrev; - int i1, i2; + UnicodeMap *uMap; + GBool isUnicode; + char space[8], eol[16], buf[8]; + int spaceLen, eolLen, len; + TextLine *line, *prevLine; + double x0, x1, y; + int firstCol, col, i; GBool multiLine; s = new GString(); - xPrev = yPrev = 0; - multiLine = gFalse; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - y = 0.5 * (str1->yMin + str1->yMax); - if (y > yMax) + + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return s; + } + isUnicode = uMap->isUnicode(); + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + 
break; + } + + // find the leftmost column + firstCol = -1; + for (line = lines; line; line = line->pageNext) { + if (line->yMin > yMax) { break; - if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) { - x0 = x1 = x2 = str1->xMin; - for (i1 = 0; i1 < str1->text->getLength(); ++i1) { - x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1]; - x1 = str1->xRight[i1]; - if (0.5 * (x0 + x1) >= xMin) - break; - } - for (i2 = str1->text->getLength() - 1; i2 > i1; --i2) { - x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1]; - x2 = str1->xRight[i2]; - if (0.5 * (x1 + x2) <= xMax) - break; - } - if (s->getLength() > 0) { - if (x0 < xPrev || str1->yMin > yPrev) { - s->append('\n'); - multiLine = gTrue; - } else { - s->append(" "); - } + } + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { + continue; + } + + y = 0.5 * (line->yMin + line->yMax); + if (y < yMin || y > yMax) { + continue; + } + + i = 0; + while (i < line->len) { + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { + break; } - s->append(str1->text->getCString() + i1, i2 - i1 + 1); - xPrev = x2; - yPrev = str1->yMax; + ++i; + } + if (i == line->len) { + continue; + } + col = line->col[i]; + + if (firstCol < 0 || col < firstCol) { + firstCol = col; } } - if (multiLine) - s->append('\n'); - return s; -} -void TextPage::dump(FILE *f) { - TextString *str1, *str2, *str3; - double yMin, yMax; - int col1, col2; - double d; - - // build x-major list - xyStrings = NULL; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - for (str2 = NULL, str3 = xyStrings; - str3; - str2 = str3, str3 = str3->xyNext) { - if (str1->xMin < str3->xMin || - (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) + // extract the text + col = firstCol; + multiLine = gFalse; + prevLine = NULL; + for (line = lines; line; line = line->pageNext) { + if (line->yMin > yMax) { + break; + } + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { + continue; + } + + y 
= 0.5 * (line->yMin + line->yMax); + if (y < yMin || y > yMax) { + continue; + } + + i = 0; + while (i < line->len) { + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { break; + } + ++i; + } + if (i == line->len) { + continue; } - if (str2) - str2->xyNext = str1; - else - xyStrings = str1; - str1->xyNext = str3; - } - // do column assignment - for (str1 = xyStrings; str1; str1 = str1->xyNext) { - col1 = 0; - for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) { - if (str1->xMin >= str2->xMax) { - col2 = str2->col + str2->text->getLength() + 4; - if (col2 > col1) - col1 = col2; - } else if (str1->xMin > str2->xMin) { - col2 = str2->col + - (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) * - str2->text->getLength()); - if (col2 > col1) { - col1 = col2; - } + // insert a return + if (line->col[i] < col || + (prevLine && + line->yMin > + prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) { + s->append(eol, eolLen); + col = firstCol; + multiLine = gTrue; + } + prevLine = line; + + // line this block up with the correct column + for (; col < line->col[i]; ++col) { + s->append(space, spaceLen); + } + + // print the portion of the line + for (; i < line->len; ++i) { + + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMax) { + break; } + + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + s->append(buf, len); + col += isUnicode ? 1 : len; } - str1->col = col1; } -#if 0 //~ for debugging - fprintf(f, "~~~~~~~~~~\n"); - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - fprintf(f, "(%4d,%4d) - (%4d,%4d) [%3d] %s\n", - (int)str1->xMin, (int)str1->yMin, (int)str1->xMax, (int)str1->yMax, - str1->col, str1->text->getCString()); + if (multiLine) { + s->append(eol, eolLen); } - fprintf(f, "~~~~~~~~~~\n"); -#endif - // output - col1 = 0; - yMax = yxStrings ? 
yxStrings->yMax : 0; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { + uMap->decRefCnt(); - // line this string up with the correct column - if (rawOrder && col1 == 0) { - col1 = str1->col; - } else { - for (; col1 < str1->col; ++col1) { - fputc(' ', f); + return s; +} + +GBool TextPage::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + TextLine *line; + TextWord *word; + double x; + GBool first; + int i; + + //~ this doesn't correctly handle: + //~ - ranges split across multiple lines (the highlighted region + //~ is the bounding box of all the parts of the range) + //~ - cases where characters don't convert one-to-one into Unicode + first = gTrue; + for (line = lines; line; line = line->pageNext) { + for (word = line->words; word; word = word->next) { + if (pos < word->charPos + word->charLen && + word->charPos < pos + length) { + i = pos - word->charPos; + if (i < 0) { + i = 0; + } + x = (i == 0) ? word->xMin : word->xRight[i - 1]; + if (first || x < *xMin) { + *xMin = x; + } + i = pos + length - word->charPos; + if (i >= word->len) { + i = word->len - 1; + } + x = word->xRight[i]; + if (first || x > *xMax) { + *xMax = x; + } + if (first || word->yMin < *yMin) { + *yMin = word->yMin; + } + if (first || word->yMax > *yMax) { + *yMax = word->yMax; + } + first = gFalse; } } + } + return !first; +} - // print the string - fputs(str1->text->getCString(), f); +void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout) { + UnicodeMap *uMap; + char space[8], eol[16], eop[8], buf[8]; + int spaceLen, eolLen, eopLen, len; + TextFlow *flow; + TextLine *line; + int col, d, n, i; + + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return; + } + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: 
+ eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + break; + } + eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); - // increment column - col1 += str1->text->getLength(); + // output the page, maintaining the original physical layout + if (physLayout || rawOrder) { + col = 0; + for (line = lines; line; line = line->pageNext) { - // update yMax for this line - if (str1->yMax > yMax) - yMax = str1->yMax; + // line this block up with the correct column + if (!rawOrder) { + for (; col < line->col[0]; ++col) { + (*outputFunc)(outputStream, space, spaceLen); + } + } - // if we've hit the end of the line... -#if 0 //~ - if (!(str1->yxNext && - !(rawOrder && str1->yxNext->yMax < str1->yMin) && - str1->yxNext->yMin < str1->yMax && - str1->yxNext->xMin >= str1->xMax)) { -#else - if (!(str1->yxNext && - !(rawOrder && str1->yxNext->yMax < str1->yMin) && - str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax && - str1->yxNext->xMin >= str1->xMax)) { -#endif + // print the line + for (i = 0; i < line->len; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); + } + col += line->convertedLen; + + // print one or more returns if necessary + if (rawOrder || + !line->pageNext || + line->pageNext->col[0] < col || + line->pageNext->yMin > + line->yMax - lineOverlapSlack * line->fontSize) { + + // compute number of returns + d = 1; + if (line->pageNext) { + d += (int)((line->pageNext->yMin - line->yMax) / + line->fontSize + 0.5); + } - // print a return - fputc('\n', f); + // various things (weird font matrices) can result in bogus + // values here, so do a sanity check + if (d < 1) { + d = 1; + } else if (d > 5) { + d = 5; + } + for (; d > 0; --d) { + (*outputFunc)(outputStream, eol, eolLen); + } - // print extra vertical space if necessary - if (str1->yxNext) { + col = 0; + } + } 
- // find yMin for next line - yMin = str1->yxNext->yMin; - for (str2 = str1->yxNext; str2; str2 = str2->yxNext) { - if (str2->yMin < yMin) - yMin = str2->yMin; - if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax && - str2->yxNext->xMin >= str2->xMax)) - break; + // output the page, "undoing" the layout + } else { + for (flow = flows; flow; flow = flow->next) { + for (line = flow->lines; line; line = line->flowNext) { + n = line->len; + if (line->flowNext && line->hyphenated) { + --n; } - - // print the space - d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5); - if (rawOrder && d > 2) { - d = 2; + for (i = 0; i < n; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); } - for (; d > 0; --d) { - fputc('\n', f); + if (line->flowNext && !line->hyphenated) { + (*outputFunc)(outputStream, space, spaceLen); } } - - // set up for next line - col1 = 0; - yMax = str1->yxNext ? str1->yxNext->yMax : 0; + (*outputFunc)(outputStream, eol, eolLen); + (*outputFunc)(outputStream, eol, eolLen); } } + + // end of page + (*outputFunc)(outputStream, eop, eopLen); + (*outputFunc)(outputStream, eol, eolLen); + + uMap->decRefCnt(); +} + +void TextPage::startPage(GfxState *state) { + clear(); + if (state) { + pageWidth = state->getPageWidth(); + pageHeight = state->getPageHeight(); + } else { + pageWidth = pageHeight = 0; + } } void TextPage::clear() { - TextString *p1, *p2; + TextWord *w1, *w2; + TextFlow *f1, *f2; - if (curStr) { - delete curStr; - curStr = NULL; + if (curWord) { + delete curWord; + curWord = NULL; } - for (p1 = yxStrings; p1; p1 = p2) { - p2 = p1->yxNext; - delete p1; + if (words) { + for (w1 = words; w1; w1 = w2) { + w2 = w1->next; + delete w1; + } + } else if (flows) { + for (f1 = flows; f1; f1 = f2) { + f2 = f1->next; + delete f1; + } } - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; + deleteGList(fonts, TextFontInfo); + + curWord = NULL; + charPos = 0; + font = NULL; + fontSize = 
0; + nest = 0; + nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); + } + //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ -TextOutputDev::TextOutputDev(char *fileName, GBool useASCII7, GBool rawOrder) { +static void outputToFile(void *stream, char *text, int len) { + fwrite(text, 1, len, (FILE *)stream); +} + +TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append) { text = NULL; - this->rawOrder = rawOrder; + physLayout = physLayoutA; + rawOrder = rawOrderA; ok = gTrue; // open file needClose = gFalse; if (fileName) { if (!strcmp(fileName, "-")) { - f = stdout; - } else if ((f = fopen(fileName, "w"))) { + outputStream = stdout; +#ifdef WIN32 + // keep DOS from munging the end-of-line characters + setmode(fileno(stdout), O_BINARY); +#endif + } else if ((outputStream = fopen(fileName, append ? 
"ab" : "wb"))) { needClose = gTrue; } else { error(-1, "Couldn't open text file '%s'", fileName); ok = gFalse; return; } + outputFunc = &outputToFile; } else { - f = NULL; + outputStream = NULL; } // set up text object - text = new TextPage(useASCII7, rawOrder); + text = new TextPage(rawOrderA); +} + +TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA) { + outputFunc = func; + outputStream = stream; + needClose = gFalse; + physLayout = physLayoutA; + rawOrder = rawOrderA; + text = new TextPage(rawOrderA); + ok = gTrue; } TextOutputDev::~TextOutputDev() { - if (needClose) - fclose(f); - if (text) + if (needClose) { +#ifdef MACOS + ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle); +#endif + fclose((FILE *)outputStream); + } + if (text) { delete text; + } } void TextOutputDev::startPage(int pageNum, GfxState *state) { - text->clear(); + text->startPage(state); } void TextOutputDev::endPage() { - text->coalesce(); - if (f) { - text->dump(f); - fputc('\n', f); - fputs("\f\n", f); - fputc('\n', f); + text->coalesce(physLayout); + if (outputStream) { + text->dump(outputStream, outputFunc, physLayout); } } void TextOutputDev::updateFont(GfxState *state) { - GfxFont *font; - char *charName; - int c; - - // look for hex char codes in subsetted font - hexCodes = gFalse; - if ((font = state->getFont()) && !font->is16Bit()) { - for (c = 0; c < 256; ++c) { - if ((charName = font->getCharName(c))) { - if ((charName[0] == 'B' || charName[0] == 'C' || - charName[0] == 'G') && - strlen(charName) == 3 && - ((charName[1] >= 'a' && charName[1] <= 'f') || - (charName[1] >= 'A' && charName[1] <= 'F') || - (charName[2] >= 'a' && charName[2] <= 'f') || - (charName[2] >= 'A' && charName[2] <= 'F'))) { - hexCodes = gTrue; - break; - } - } - } - } + text->updateFont(state); } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state, s, hexCodes); + text->beginWord(state, state->getCurX(), 
state->getCurY()); } void TextOutputDev::endString(GfxState *state) { - text->endString(); + text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c) { - text->addChar(state, x, y, dx, dy, c); + double dx, double dy, + double originX, double originY, + CharCode c, Unicode *u, int uLen) { + text->addChar(state, x, y, dx, dy, c, u, uLen); } -void TextOutputDev::drawChar16(GfxState *state, double x, double y, - double dx, double dy, int c) { - text->addChar16(state, x, y, dx, dy, c, state->getFont()->getCharSet16()); -} - -GBool TextOutputDev::findText(char *s, GBool top, GBool bottom, +GBool TextOutputDev::findText(Unicode *s, int len, + GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { - return text->findText(s, top, bottom, xMin, yMin, xMax, yMax); + return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax); +} + +GString *TextOutputDev::getText(double xMin, double yMin, + double xMax, double yMax) { + return text->getText(xMin, yMin, xMax, yMax); } + +GBool TextOutputDev::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); +} + +