//
// TextOutputDev.cc
//
-// Copyright 1997-2002 Glyph & Cog, LLC
+// Copyright 1997-2003 Glyph & Cog, LLC
//
//========================================================================
-#ifdef __GNUC__
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
-#include <aconf.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <ctype.h>
-#include "GString.h"
+#ifdef WIN32
+#include <fcntl.h> // for O_BINARY
+#include <io.h> // for setmode
+#endif
#include "gmem.h"
+#include "GString.h"
+#include "GList.h"
#include "config.h"
#include "Error.h"
#include "GlobalParams.h"
#endif
//------------------------------------------------------------------------
-// TextString
+// parameters
+//------------------------------------------------------------------------
+
+// Minium and maximum inter-word spacing (as a fraction of the average
+// character width).
+#define wordMinSpaceWidth 0.3
+#define wordMaxSpaceWidth 2.0
+
+// Default min and max inter-word spacing (when the average character
+// width is unknown).
+#define wordDefMinSpaceWidth 0.2
+#define wordDefMaxSpaceWidth 1.5
+
+// Max difference in x,y coordinates (as a fraction of the font size)
+// allowed for duplicated text (fake boldface, drop shadows) which is
+// to be discarded.
+#define dupMaxDeltaX 0.1
+#define dupMaxDeltaY 0.2
+
+// Min overlap (as a fraction of the font size) required for two
+// lines to be considered vertically overlapping.
+#define lineOverlapSlack 0.5
+
+// Max difference in baseline y coordinates (as a fraction of the font
+// size) allowed for words which are to be grouped into a line, not
+// including sub/superscripts.
+#define lineMaxBaselineDelta 0.1
+
+// Max ratio of font sizes allowed for words which are to be grouped
+// into a line, not including sub/superscripts.
+#define lineMaxFontSizeRatio 1.4
+
+// Min spacing (as a fraction of the font size) allowed between words
+// which are to be grouped into a line.
+#define lineMinDeltaX -0.5
+
+// Minimum vertical overlap (as a fraction of the font size) required
+// for superscript and subscript words.
+#define lineMinSuperscriptOverlap 0.3
+#define lineMinSubscriptOverlap 0.3
+
+// Min/max ratio of font sizes allowed for sub/superscripts compared to
+// the base text.
+#define lineMinSubscriptFontSizeRatio 0.4
+#define lineMaxSubscriptFontSizeRatio 1.01
+#define lineMinSuperscriptFontSizeRatio 0.4
+#define lineMaxSuperscriptFontSizeRatio 1.01
+
+// Max horizontal spacing (as a fraction of the font size) allowed
+// before sub/superscripts.
+#define lineMaxSubscriptDeltaX 0.2
+#define lineMaxSuperscriptDeltaX 0.2
+
+// Maximum vertical spacing (as a fraction of the font size) allowed
+// for lines which are to be grouped into a block.
+#define blkMaxSpacing 2.0
+
+// Max ratio of primary font sizes allowed for lines which are to be
+// grouped into a block.
+#define blkMaxFontSizeRatio 1.3
+
+// Min overlap (as a fraction of the font size) required for two
+// blocks to be considered vertically overlapping.
+#define blkOverlapSlack 0.5
+
+// Max vertical spacing (as a fraction of the font size) allowed
+// between blocks which are 'adjacent' when sorted by reading order.
+#define blkMaxSortSpacing 2.0
+
+// Max vertical offset (as a fraction of the font size) of the top and
+// bottom edges allowed for blocks which are to be grouped into a
+// flow.
+#define flowMaxDeltaY 1.0
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+TextFontInfo::TextFontInfo(GfxState *state) {
+ double *textMat;
+ double t1, t2, avgWidth, w;
+ int n, i;
+
+ gfxFont = state->getFont();
+ textMat = state->getTextMat();
+ horizScaling = state->getHorizScaling();
+ if ((t1 = fabs(textMat[0])) > 0.01 &&
+ (t2 = fabs(textMat[3])) > 0.01) {
+ horizScaling *= t1 / t2;
+ }
+
+ minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+ maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+ if (gfxFont && gfxFont->isCIDFont()) {
+ //~ handle 16-bit fonts
+ } else if (gfxFont && gfxFont->getType() != fontType3) {
+ avgWidth = 0;
+ n = 0;
+ for (i = 0; i < 256; ++i) {
+ w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
+ if (w > 0) {
+ avgWidth += w;
+ ++n;
+ }
+ }
+ if (n > 0) {
+ avgWidth /= n;
+ minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
+ maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
+ }
+ }
+
+}
+
+TextFontInfo::~TextFontInfo() {
+}
+
+GBool TextFontInfo::matches(GfxState *state) {
+ double *textMat;
+ double t1, t2, h;
+
+ textMat = state->getTextMat();
+ h = state->getHorizScaling();
+ if ((t1 = fabs(textMat[0])) > 0.01 &&
+ (t2 = fabs(textMat[3])) > 0.01) {
+ h *= t1 / t2;
+ }
+ return state->getFont() == gfxFont &&
+ fabs(h - horizScaling) < 0.01;
+}
+
+//------------------------------------------------------------------------
+// TextWord
//------------------------------------------------------------------------
-TextString::TextString(GfxState *state, double fontSize) {
- GfxFont *font;
+TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
+ TextFontInfo *fontA, double fontSizeA) {
+ GfxFont *gfxFont;
double x, y;
- state->transform(state->getCurX(), state->getCurY(), &x, &y);
- if ((font = state->getFont())) {
- yMin = y - font->getAscent() * fontSize;
- yMax = y - font->getDescent() * fontSize;
+ charPos = charPosA;
+ charLen = 0;
+ font = fontA;
+ fontSize = fontSizeA;
+ state->transform(x0, y0, &x, &y);
+ if ((gfxFont = font->gfxFont)) {
+ yMin = y - gfxFont->getAscent() * fontSize;
+ yMax = y - gfxFont->getDescent() * fontSize;
} else {
// this means that the PDF file draws text without a current font,
// which should never happen
yMin = y;
yMax = y + 1;
}
- col = 0;
+ yBase = y;
text = NULL;
xRight = NULL;
len = size = 0;
- yxNext = NULL;
- xyNext = NULL;
+ spaceAfter = gFalse;
+ next = NULL;
+
}
-TextString::~TextString() {
+
+TextWord::~TextWord() {
gfree(text);
gfree(xRight);
}
-void TextString::addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode u) {
+void TextWord::addChar(GfxState *state, double x, double y,
+ double dx, double dy, Unicode u) {
if (len == size) {
size += 16;
text = (Unicode *)grealloc(text, size * sizeof(Unicode));
++len;
}
+// Returns true if <this> comes before <word2> in xy order.
+GBool TextWord::xyBefore(TextWord *word2) {
+ return xMin < word2->xMin ||
+ (xMin == word2->xMin && yMin < word2->yMin);
+}
+
+// Merge another word onto the end of this one.
+void TextWord::merge(TextWord *word2) {
+ int i;
+
+ xMax = word2->xMax;
+ if (word2->yMin < yMin) {
+ yMin = word2->yMin;
+ }
+ if (word2->yMax > yMax) {
+ yMax = word2->yMax;
+ }
+ if (len + word2->len > size) {
+ size = len + word2->len;
+ text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+ xRight = (double *)grealloc(xRight, size * sizeof(double));
+ }
+ for (i = 0; i < word2->len; ++i) {
+ text[len + i] = word2->text[i];
+ xRight[len + i] = word2->xRight[i];
+ }
+ len += word2->len;
+ charLen += word2->charLen;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+TextLine::TextLine() {
+ words = NULL;
+ text = NULL;
+ xRight = NULL;
+ col = NULL;
+ len = 0;
+ hyphenated = gFalse;
+ pageNext = NULL;
+ next = NULL;
+ flowNext = NULL;
+}
+
+TextLine::~TextLine() {
+ TextWord *w1, *w2;
+
+ for (w1 = words; w1; w1 = w2) {
+ w2 = w1->next;
+ delete w1;
+ }
+ gfree(text);
+ gfree(xRight);
+ gfree(col);
+}
+
+// Returns true if <this> comes before <line2> in yx order, allowing
+// slack for vertically overlapping lines.
+GBool TextLine::yxBefore(TextLine *line2) {
+ double dy;
+
+ dy = lineOverlapSlack * fontSize;
+
+ // non-overlapping case
+ if (line2->yMin > yMax - dy ||
+ line2->yMax < yMin + dy) {
+ return yMin < line2->yMin ||
+ (yMin == line2->yMin && xMin < line2->xMin);
+ }
+
+ // overlapping case
+ return xMin < line2->xMin;
+}
+
+// Merge another line's words onto the end of this line.
+void TextLine::merge(TextLine *line2) {
+ int newLen, i;
+
+ xMax = line2->xMax;
+ if (line2->yMin < yMin) {
+ yMin = line2->yMin;
+ }
+ if (line2->yMax > yMax) {
+ yMax = line2->yMax;
+ }
+ xSpaceR = line2->xSpaceR;
+ lastWord->spaceAfter = gTrue;
+ lastWord->next = line2->words;
+ lastWord = line2->lastWord;
+ line2->words = NULL;
+ newLen = len + 1 + line2->len;
+ text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
+ xRight = (double *)grealloc(xRight, newLen * sizeof(double));
+ text[len] = (Unicode)0x0020;
+ xRight[len] = line2->xMin;
+ for (i = 0; i < line2->len; ++i) {
+ text[len + 1 + i] = line2->text[i];
+ xRight[len + 1 + i] = line2->xRight[i];
+ }
+ len = newLen;
+ convertedLen += line2->convertedLen;
+ hyphenated = line2->hyphenated;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+TextBlock::TextBlock() {
+ lines = NULL;
+ next = NULL;
+}
+
+TextBlock::~TextBlock() {
+ TextLine *l1, *l2;
+
+ for (l1 = lines; l1; l1 = l2) {
+ l2 = l1->next;
+ delete l1;
+ }
+}
+
+// Returns true if <this> comes before <blk2> in xy order, allowing
+// slack for vertically overlapping blocks.
+GBool TextBlock::yxBefore(TextBlock *blk2) {
+ double dy;
+
+ dy = blkOverlapSlack * lines->fontSize;
+
+ // non-overlapping case
+ if (blk2->yMin > yMax - dy ||
+ blk2->yMax < yMin + dy) {
+ return yMin < blk2->yMin ||
+ (yMin == blk2->yMin && xMin < blk2->xMin);
+ }
+
+ // overlapping case
+ return xMin < blk2->xMin;
+}
+
+// Merge another block's line onto the right of this one.
+void TextBlock::mergeRight(TextBlock *blk2) {
+ lines->merge(blk2->lines);
+ xMax = lines->xMax;
+ yMin = lines->yMin;
+ yMax = lines->yMax;
+ xSpaceR = lines->xSpaceR;
+}
+
+// Merge another block's lines onto the bottom of this block.
+void TextBlock::mergeBelow(TextBlock *blk2) {
+ TextLine *line;
+
+ if (blk2->xMin < xMin) {
+ xMin = blk2->xMin;
+ }
+ if (blk2->xMax > xMax) {
+ xMax = blk2->xMax;
+ }
+ yMax = blk2->yMax;
+ if (blk2->xSpaceL > xSpaceL) {
+ xSpaceL = blk2->xSpaceL;
+ }
+ if (blk2->xSpaceR < xSpaceR) {
+ xSpaceR = blk2->xSpaceR;
+ }
+ if (blk2->maxFontSize > maxFontSize) {
+ maxFontSize = blk2->maxFontSize;
+ }
+ for (line = lines; line->next; line = line->next) ;
+ line->next = line->flowNext = blk2->lines;
+ blk2->lines = NULL;
+}
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+TextFlow::TextFlow() {
+ blocks = NULL;
+ next = NULL;
+}
+
+TextFlow::~TextFlow() {
+ TextBlock *b1, *b2;
+
+ for (b1 = blocks; b1; b1 = b2) {
+ b2 = b1->next;
+ delete b1;
+ }
+}
+
+
//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(GBool rawOrderA) {
rawOrder = rawOrderA;
- curStr = NULL;
+ curWord = NULL;
+ charPos = 0;
+ font = NULL;
fontSize = 0;
- yxStrings = NULL;
- xyStrings = NULL;
- yxCur1 = yxCur2 = NULL;
nest = 0;
+ nTinyChars = 0;
+ words = wordPtr = NULL;
+ lines = NULL;
+ flows = NULL;
+ fonts = new GList();
}
TextPage::~TextPage() {
clear();
+ delete fonts;
}
void TextPage::updateFont(GfxState *state) {
- GfxFont *font;
+ GfxFont *gfxFont;
double *fm;
char *name;
- int code;
+ int code, mCode, letterCode, anyCode;
double w;
+ int i;
+
+ // get the font info object
+ font = NULL;
+ for (i = 0; i < fonts->getLength(); ++i) {
+ font = (TextFontInfo *)fonts->get(i);
+ if (font->matches(state)) {
+ break;
+ }
+ font = NULL;
+ }
+ if (!font) {
+ font = new TextFontInfo(state);
+ fonts->append(font);
+ }
// adjust the font size
+ gfxFont = state->getFont();
fontSize = state->getTransformedFontSize();
- if ((font = state->getFont()) && font->getType() == fontType3) {
+ if (gfxFont && gfxFont->getType() == fontType3) {
// This is a hack which makes it possible to deal with some Type 3
// fonts. The problem is that it's impossible to know what the
// base coordinate system used in the font is without actually
// rendering the font. This code tries to guess by looking at the
// width of the character 'm' (which breaks if the font is a
// subset that doesn't contain 'm').
+ mCode = letterCode = anyCode = -1;
for (code = 0; code < 256; ++code) {
- if ((name = ((Gfx8BitFont *)font)->getCharName(code)) &&
- name[0] == 'm' && name[1] == '\0') {
- break;
+ name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
+ if (name && name[0] == 'm' && name[1] == '\0') {
+ mCode = code;
}
- }
- if (code < 256) {
- w = ((Gfx8BitFont *)font)->getWidth(code);
- if (w != 0) {
- // 600 is a generic average 'm' width -- yes, this is a hack
- fontSize *= w / 0.6;
+ if (letterCode < 0 && name && name[1] == '\0' &&
+ ((name[0] >= 'A' && name[0] <= 'Z') ||
+ (name[0] >= 'a' && name[0] <= 'z'))) {
+ letterCode = code;
+ }
+ if (anyCode < 0 && name &&
+ ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
+ anyCode = code;
}
}
- fm = font->getFontMatrix();
+ if (mCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
+ // 0.6 is a generic average 'm' width -- yes, this is a hack
+ fontSize *= w / 0.6;
+ } else if (letterCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
+ // even more of a hack: 0.5 is a generic letter width
+ fontSize *= w / 0.5;
+ } else if (anyCode >= 0 &&
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
+ // better than nothing: 0.5 is a generic character width
+ fontSize *= w / 0.5;
+ }
+ fm = gfxFont->getFontMatrix();
if (fm[0] != 0) {
fontSize *= fabs(fm[3] / fm[0]);
}
}
}
-void TextPage::beginString(GfxState *state) {
+void TextPage::beginWord(GfxState *state, double x0, double y0) {
// This check is needed because Type 3 characters can contain
- // text-drawing operations.
- if (curStr) {
+ // text-drawing operations (when TextPage is being used via
+ // XOutputDev rather than TextOutputDev).
+ if (curWord) {
++nest;
return;
}
- curStr = new TextString(state, fontSize);
+ curWord = new TextWord(state, x0, y0, charPos, font, fontSize);
}
void TextPage::addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode *u, int uLen) {
- double x1, y1, w1, h1, dx2, dy2;
+ double dx, double dy,
+ CharCode c, Unicode *u, int uLen) {
+ double x1, y1, w1, h1, dx2, dy2, sp;
int n, i;
+ // if the previous char was a space, addChar will have called
+ // endWord, so we need to start a new word
+ if (!curWord) {
+ beginWord(state, x, y);
+ }
+
+ // throw away chars that aren't inside the page bounds
state->transform(x, y, &x1, &y1);
- n = curStr->len;
- if (n > 0 &&
- x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
- endString();
- beginString(state);
- }
- state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
- 0, &dx2, &dy2);
+ if (x1 < 0 || x1 > pageWidth ||
+ y1 < 0 || y1 > pageHeight) {
+ return;
+ }
+
+ // subtract char and word spacing from the dx,dy values
+ sp = state->getCharSpace();
+ if (c == (CharCode)0x20) {
+ sp += state->getWordSpace();
+ }
+ state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
dx -= dx2;
dy -= dy2;
state->transformDelta(dx, dy, &w1, &h1);
+
+ // check the tiny chars limit
+ if (!globalParams->getTextKeepTinyChars() &&
+ fabs(w1) < 3 && fabs(h1) < 3) {
+ if (++nTinyChars > 20000) {
+ return;
+ }
+ }
+
+ // break words at space character
+ if (uLen == 1 && u[0] == (Unicode)0x20) {
+ ++curWord->charLen;
+ ++charPos;
+ endWord();
+ return;
+ }
+
+ // large char spacing is sometimes used to move text around -- in
+ // this case, break text into individual chars and let the coalesce
+ // function deal with it later
+ n = curWord->len;
+ if (n > 0 && x1 - curWord->xRight[n-1] >
+ curWord->font->minSpaceWidth * curWord->fontSize) {
+ endWord();
+ beginWord(state, x, y);
+ }
+
+ // page rotation and/or transform matrices can cause text to be
+ // drawn in reverse order -- in this case, swap the begin/end
+ // coordinates and break text into individual chars
+ if (w1 < 0) {
+ endWord();
+ beginWord(state, x + dx, y + dy);
+ x1 += w1;
+ y1 += h1;
+ w1 = -w1;
+ h1 = -h1;
+ }
+
+ // add the characters to the current word
if (uLen != 0) {
w1 /= uLen;
h1 /= uLen;
}
for (i = 0; i < uLen; ++i) {
- curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
}
+ ++curWord->charLen;
+ ++charPos;
}
-void TextPage::endString() {
- TextString *p1, *p2;
- double h, y1, y2;
-
+void TextPage::endWord() {
// This check is needed because Type 3 characters can contain
- // text-drawing operations.
+ // text-drawing operations (when TextPage is being used via
+ // XOutputDev rather than TextOutputDev).
if (nest > 0) {
--nest;
return;
}
- // throw away zero-length strings -- they don't have valid xMin/xMax
+ if (curWord) {
+ addWord(curWord);
+ curWord = NULL;
+ }
+}
+
+void TextPage::addWord(TextWord *word) {
+ TextWord *p1, *p2;
+
+ // throw away zero-length words -- they don't have valid xMin/xMax
// values, and they're useless anyway
- if (curStr->len == 0) {
- delete curStr;
- curStr = NULL;
+ if (word->len == 0) {
+ delete word;
return;
}
- // insert string in y-major list
- h = curStr->yMax - curStr->yMin;
- y1 = curStr->yMin + 0.5 * h;
- y2 = curStr->yMin + 0.8 * h;
+ // insert word in xy list
if (rawOrder) {
- p1 = yxCur1;
+ p1 = wordPtr;
p2 = NULL;
- } else if ((!yxCur1 ||
- (y1 >= yxCur1->yMin &&
- (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) &&
- (!yxCur2 ||
- (y1 < yxCur2->yMin ||
- (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
- p1 = yxCur1;
- p2 = yxCur2;
} else {
- for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
- if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) {
+ if (wordPtr && wordPtr->xyBefore(word)) {
+ p1 = wordPtr;
+ p2 = wordPtr->next;
+ } else {
+ p1 = NULL;
+ p2 = words;
+ }
+ for (; p2; p1 = p2, p2 = p2->next) {
+ if (word->xyBefore(p2)) {
break;
}
}
- yxCur2 = p2;
}
- yxCur1 = curStr;
if (p1) {
- p1->yxNext = curStr;
+ p1->next = word;
} else {
- yxStrings = curStr;
+ words = word;
}
- curStr->yxNext = p2;
- curStr = NULL;
+ word->next = p2;
+ wordPtr = word;
}
-void TextPage::coalesce() {
- TextString *str1, *str2;
- double space, d;
- GBool addSpace;
- int n, i;
+void TextPage::coalesce(GBool physLayout) {
+ TextWord *word0, *word1, *word2;
+ TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
+ TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
+ TextBlock *yxBlocks, *blocks, *blkStack;
+ TextFlow *flow0, *flow1;
+ double sz, xLimit, yLimit;
+ double fit1, fit2, sp1, sp2;
+ GBool found;
+ UnicodeMap *uMap;
+ GBool isUnicode;
+ char buf[8];
+ int col1, col2, d, i, j;
+
+#if 0 // for debugging
+ printf("*** initial word list ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
-#if 0 //~ for debugging
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- printf("x=%3d..%3d y=%3d..%3d size=%2d '",
- (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
- (int)(str1->yMax - str1->yMin));
- for (i = 0; i < str1->len; ++i) {
- fputc(str1->text[i] & 0xff, stdout);
+ //----- discard duplicated text (fake boldface, drop shadows)
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+ xLimit = word0->xMin + sz * dupMaxDeltaX;
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->len == word0->len &&
+ !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
+ fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
+ fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
+ fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
+ found = gTrue;
+ break;
+ }
+ }
+ if (found) {
+ word1->next = word2->next;
+ delete word2;
+ } else {
+ word0 = word0->next;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** words after removing duplicate text ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
}
printf("'\n");
}
- printf("\n------------------------------------------------------------\n\n");
+ printf("\n");
+ fflush(stdout);
#endif
- str1 = yxStrings;
- while (str1 && (str2 = str1->yxNext)) {
- space = str1->yMax - str1->yMin;
- d = str2->xMin - str1->xMax;
- if (((rawOrder &&
- ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
- (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
- (!rawOrder && str2->yMin < str1->yMax)) &&
- d > -0.5 * space && d < space) {
- n = str1->len + str2->len;
- if ((addSpace = d > 0.1 * space)) {
- ++n;
+
+ //----- merge words
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+
+ // look for adjacent text which is part of the same word, and
+ // merge it into this word
+ xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
+ if (rawOrder) {
+ word1 = word0;
+ word2 = word0->next;
+ found = word2 &&
+ word2->xMin < xLimit &&
+ word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05 &&
+ word2->charPos == word0->charPos + word0->charLen;
+ } else {
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05 &&
+ word2->charPos == word0->charPos + word0->charLen) {
+ found = gTrue;
+ break;
+ }
+ }
+ }
+ if (found) {
+ word0->merge(word2);
+ word1->next = word2->next;
+ delete word2;
+ continue;
+ }
+
+ word0 = word0->next;
+ }
+
+#if 0 // for debugging
+ printf("*** after merging words ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- assemble words into lines
+
+ lineList = line0 = NULL;
+ while (words) {
+
+ // remove the first word from the word list
+ word0 = words;
+ words = words->next;
+ word0->next = NULL;
+
+ // find the best line (if any) for the word
+ if (rawOrder) {
+ if (line0 && lineFit(line0, word0, &sp2) >= 0) {
+ line1 = line0;
+ sp1 = sp2;
+ } else {
+ line1 = NULL;
+ sp1 = 0;
+ }
+ } else {
+ line1 = NULL;
+ fit1 = 0;
+ sp1 = 0;
+ for (line2 = lineList; line2; line2 = line2->next) {
+ fit2 = lineFit(line2, word0, &sp2);
+ if (fit2 >= 0 && (!line1 || fit2 < fit1)) {
+ line1 = line2;
+ fit1 = fit2;
+ sp1 = sp2;
+ }
+ }
+ }
+
+ // found a line: append the word
+ if (line1) {
+ word1 = line1->lastWord;
+ word1->next = word0;
+ line1->lastWord = word0;
+ if (word0->xMax > line1->xMax) {
+ line1->xMax = word0->xMax;
+ }
+ if (word0->yMin < line1->yMin) {
+ line1->yMin = word0->yMin;
+ }
+ if (word0->yMax > line1->yMax) {
+ line1->yMax = word0->yMax;
+ }
+ line1->len += word0->len;
+ if (sp1 > line1->fontSize * line1->font->minSpaceWidth) {
+ word1->spaceAfter = gTrue;
+ ++line1->len;
+ }
+
+ // didn't find a line: create a new line
+ } else {
+ line1 = new TextLine();
+ line1->words = line1->lastWord = word0;
+ line1->xMin = word0->xMin;
+ line1->xMax = word0->xMax;
+ line1->yMin = word0->yMin;
+ line1->yMax = word0->yMax;
+ line1->yBase = word0->yBase;
+ line1->font = word0->font;
+ line1->fontSize = word0->fontSize;
+ line1->len = word0->len;
+ if (line0) {
+ line0->next = line1;
+ } else {
+ lineList = line1;
+ }
+ line0 = line1;
+ }
+ }
+
+ // build the line text
+ uMap = globalParams->getTextEncoding();
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
+
+ for (line1 = lineList; line1; line1 = line1->next) {
+ line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
+ line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
+ line1->col = (int *)gmalloc(line1->len * sizeof(int));
+ i = 0;
+ for (word1 = line1->words; word1; word1 = word1->next) {
+ for (j = 0; j < word1->len; ++j) {
+ line1->text[i] = word1->text[j];
+ line1->xRight[i] = word1->xRight[j];
+ ++i;
+ }
+ if (word1->spaceAfter && word1->next) {
+ line1->text[i] = (Unicode)0x0020;
+ line1->xRight[i] = word1->next->xMin;
+ ++i;
+ }
+ }
+ line1->convertedLen = 0;
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] = line1->convertedLen;
+ if (isUnicode) {
+ ++line1->convertedLen;
+ } else if (uMap) {
+ line1->convertedLen +=
+ uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
+ }
+ }
+
+ // check for hyphen at end of line
+ //~ need to check for other chars used as hyphens
+ if (line1->text[line1->len - 1] == (Unicode)'-') {
+ line1->hyphenated = gTrue;
+ }
+
+ }
+
+ if (uMap) {
+ uMap->decRefCnt();
+ }
+
+#if 0 // for debugging
+ printf("*** lines in xy order ***\n");
+ for (line0 = lineList; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- column assignment
+
+ for (line1 = lineList; line1; line1 = line1->next) {
+ col1 = 0;
+ for (line2 = lineList; line2 != line1; line2 = line2->next) {
+ if (line1->xMin >= line2->xMax) {
+ d = (int)((line1->xMin - line2->xMax) /
+ (line1->font->maxSpaceWidth * line1->fontSize));
+ if (d > 4) {
+ d = 4;
+ }
+ col2 = line2->col[0] + line2->convertedLen + d;
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ } else if (line1->xMin > line2->xMin) {
+ for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
+ col2 = line2->col[i];
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ }
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] += col1;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** lines in xy order, after column assignment ***\n");
+ for (line0 = lineList; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->col[0], line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- assemble lines into blocks
+
+ if (rawOrder) {
+
+ lines = lineList;
+ for (line1 = lines; line1; line1 = line1->next) {
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ }
+
+ } else {
+
+ // sort lines into yx order
+ lines = NULL;
+ while (lineList) {
+ line0 = lineList;
+ lineList = lineList->next;
+ for (line1 = NULL, line2 = lines;
+ line2 && !line0->yxBefore(line2);
+ line1 = line2, line2 = line2->next) ;
+ if (line1) {
+ line1->next = line0;
+ } else {
+ lines = line0;
+ }
+ line0->next = line2;
+ }
+
+ // compute whitespace to left and right of each line
+ line0 = lines;
+ for (line1 = lines; line1; line1 = line1->next) {
+
+ // find the first vertically overlapping line
+ for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
+
+ // check each vertically overlapping line -- look for the nearest
+ // on each side
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ for (line2 = line0;
+ line2 && line2->yMin < line1->yMax;
+ line2 = line2->next) {
+ if (line2->yMax > line1->yMin) {
+ if (line2->xMax < line1->xMin) {
+ if (line2->xMax > line1->xSpaceL) {
+ line1->xSpaceL = line2->xMax;
+ }
+ } else if (line2->xMin > line1->xMax) {
+ if (line2->xMin < line1->xSpaceR) {
+ line1->xSpaceR = line2->xMin;
+ }
+ }
+ }
+ }
+ }
+ } // (!rawOrder)
+
+#if 0 // for debugging
+ printf("*** lines in yx order ***\n");
+ for (line0 = lines; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ lineList = lines;
+ yxBlocks = NULL;
+ blk0 = NULL;
+ while (lineList) {
+
+ // build a new block object
+ line0 = lineList;
+ lineList = lineList->next;
+ line0->next = NULL;
+ blk1 = new TextBlock();
+ blk1->lines = line0;
+ blk1->xMin = line0->xMin;
+ blk1->xMax = line0->xMax;
+ blk1->yMin = line0->yMin;
+ blk1->yMax = line0->yMax;
+ blk1->xSpaceL = line0->xSpaceL;
+ blk1->xSpaceR = line0->xSpaceR;
+ blk1->maxFontSize = line0->fontSize;
+
+ // find subsequent lines in the block
+ while (lineList) {
+
+ // look for the first horizontally overlapping line below this
+ // one
+ yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
+ line3 = line4 = NULL;
+ if (rawOrder) {
+ if (lineList->yMin < yLimit &&
+ lineList->xMax > blk1->xMin &&
+ lineList->xMin < blk1->xMax) {
+ line3 = NULL;
+ line4 = lineList;
+ }
+ } else {
+ for (line1 = NULL, line2 = lineList;
+ line2 && line2->yMin < yLimit;
+ line1 = line2, line2 = line2->next) {
+ if (line2->xMax > blk1->xMin &&
+ line2->xMin < blk1->xMax) {
+ line3 = line1;
+ line4 = line2;
+ break;
+ }
+ }
+ }
+
+ // if there is an overlapping line and it fits in the block, add
+ // it to the block
+ if (line4 && blockFit(blk1, line4)) {
+ if (line3) {
+ line3->next = line4->next;
+ } else {
+ lineList = line4->next;
+ }
+ line0->next = line0->flowNext = line4;
+ line4->next = NULL;
+ if (line4->xMin < blk1->xMin) {
+ blk1->xMin = line4->xMin;
+ } else if (line4->xMax > blk1->xMax) {
+ blk1->xMax = line4->xMax;
+ }
+ if (line4->yMax > blk1->yMax) {
+ blk1->yMax = line4->yMax;
+ }
+ if (line4->xSpaceL > blk1->xSpaceL) {
+ blk1->xSpaceL = line4->xSpaceL;
+ }
+ if (line4->xSpaceR < blk1->xSpaceR) {
+ blk1->xSpaceR = line4->xSpaceR;
+ }
+ if (line4->fontSize > blk1->maxFontSize) {
+ blk1->maxFontSize = line4->fontSize;
+ }
+ line0 = line4;
+
+ // otherwise, we're done with this block
+ } else {
+ break;
}
- str1->size = (n + 15) & ~15;
- str1->text = (Unicode *)grealloc(str1->text,
- str1->size * sizeof(Unicode));
- str1->xRight = (double *)grealloc(str1->xRight,
- str1->size * sizeof(double));
- if (addSpace) {
- str1->text[str1->len] = 0x20;
- str1->xRight[str1->len] = str2->xMin;
- ++str1->len;
- }
- for (i = 0; i < str2->len; ++i) {
- str1->text[str1->len] = str2->text[i];
- str1->xRight[str1->len] = str2->xRight[i];
- ++str1->len;
- }
- if (str2->xMax > str1->xMax) {
- str1->xMax = str2->xMax;
- }
- if (str2->yMax > str1->yMax) {
- str1->yMax = str2->yMax;
- }
- str1->yxNext = str2->yxNext;
- delete str2;
+ }
+
+ // insert block on list, in yx order
+ if (rawOrder) {
+ blk2 = blk0;
+ blk3 = NULL;
+ blk0 = blk1;
} else {
- str1 = str2;
+ for (blk2 = NULL, blk3 = yxBlocks;
+ blk3 && !blk1->yxBefore(blk3);
+ blk2 = blk3, blk3 = blk3->next) ;
+ }
+ blk1->next = blk3;
+ if (blk2) {
+ blk2->next = blk1;
+ } else {
+ yxBlocks = blk1;
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** blocks in yx order ***\n");
+ for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- merge lines and blocks, sort blocks into reading order
+
+ if (rawOrder) {
+ blocks = yxBlocks;
+
+ } else {
+ blocks = NULL;
+ blk0 = NULL;
+ blkStack = NULL;
+ while (yxBlocks) {
+
+ // find the next two blocks:
+ // - if the depth-first traversal stack is empty, take the first
+ // (upper-left-most) two blocks on the yx-sorted block list
+ // - otherwise, find the two upper-left-most blocks under the top
+ // block on the stack
+ if (blkStack) {
+ blk3 = blk4 = blk5 = blk6 = NULL;
+ for (blk1 = NULL, blk2 = yxBlocks;
+ blk2;
+ blk1 = blk2, blk2 = blk2->next) {
+ if (blk2->yMin > blkStack->yMin &&
+ blk2->xMax > blkStack->xMin &&
+ blk2->xMin < blkStack->xMax) {
+ if (!blk4 || blk2->yxBefore(blk4)) {
+ blk5 = blk3;
+ blk6 = blk4;
+ blk3 = blk1;
+ blk4 = blk2;
+ } else if (!blk6 || blk2->yxBefore(blk6)) {
+ blk5 = blk1;
+ blk6 = blk2;
+ }
+ }
+ }
+ } else {
+ blk3 = NULL;
+ blk4 = yxBlocks;
+ blk5 = yxBlocks;
+ blk6 = yxBlocks->next;
+ }
+
+ // merge case 1:
+ // | | |
+ // | blkStack | | blkStack
+ // +---------------------+ --> +--------------
+ // +------+ +------+ +-----------+
+ // | blk4 | | blk6 | ... | blk4+blk6 |
+ // +------+ +------+ +-----------+
+ yLimit = 0; // make gcc happy
+ if (blkStack) {
+ yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize;
+ }
+ if (blkStack && blk4 && blk6 &&
+ !blk4->lines->next && !blk6->lines->next &&
+ lineFit2(blk4->lines, blk6->lines) &&
+ blk4->yMin < yLimit &&
+ blk4->xMin > blkStack->xSpaceL &&
+ blkStack->xMin > blk4->xSpaceL &&
+ blk6->xMax < blkStack->xSpaceR) {
+ blk4->mergeRight(blk6);
+ if (blk5) {
+ blk5->next = blk6->next;
+ } else {
+ yxBlocks = blk6->next;
+ }
+ delete blk6;
+
+ // merge case 2:
+ // | | | |
+ // | blkStack | | |
+ // +---------------------+ --> | blkStack+blk2 |
+ // +---------------------+ | |
+ // | blk4 | | |
+ // | | | |
+ } else if (blkStack && blk4 &&
+ blk4->yMin < yLimit &&
+ blockFit2(blkStack, blk4)) {
+ blkStack->mergeBelow(blk4);
+ if (blk3) {
+ blk3->next = blk4->next;
+ } else {
+ yxBlocks = blk4->next;
+ }
+ delete blk4;
+
+ // if any of:
+ // 1. no block found
+ // 2. non-fully overlapping block found
+ // 3. large vertical gap above the overlapping block
+ // then pop the stack and try again
+ } else if (!blk4 ||
+ (blkStack && (blk4->xMin < blkStack->xSpaceL ||
+ blk4->xMax > blkStack->xSpaceR ||
+ blk4->yMin - blkStack->yMax >
+ blkMaxSortSpacing * blkStack->maxFontSize))) {
+ blkStack = blkStack->stackNext;
+
+ // add a block to the sorted list
+ } else {
+
+ // remove the block from the yx-sorted list
+ if (blk3) {
+ blk3->next = blk4->next;
+ } else {
+ yxBlocks = blk4->next;
+ }
+ blk4->next = NULL;
+
+ // append the block to the reading-order list
+ if (blk0) {
+ blk0->next = blk4;
+ } else {
+ blocks = blk4;
+ }
+ blk0 = blk4;
+
+ // push the block on the traversal stack
+ if (!physLayout) {
+ blk4->stackNext = blkStack;
+ blkStack = blk4;
+ }
+ }
+ }
+ } // (!rawOrder)
+
+#if 0 // for debugging
+ printf("*** blocks in reading order (after merging) ***\n");
+ for (blk0 = blocks; blk0; blk0 = blk0->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- assemble blocks into flows
+
+ if (rawOrder) {
+
+ // one flow per block
+ flow0 = NULL;
+ while (blocks) {
+ flow1 = new TextFlow();
+ flow1->blocks = blocks;
+ flow1->lines = blocks->lines;
+ flow1->yMin = blocks->yMin;
+ flow1->yMax = blocks->yMax;
+ blocks = blocks->next;
+ flow1->blocks->next = NULL;
+ if (flow0) {
+ flow0->next = flow1;
+ } else {
+ flows = flow1;
+ }
+ flow0 = flow1;
+ }
+
+ } else {
+
+ // compute whitespace above and below each block
+ for (blk0 = blocks; blk0; blk0 = blk0->next) {
+ blk0->ySpaceT = 0;
+ blk0->ySpaceB = pageHeight;
+
+ // check each horizontally overlapping block
+ for (blk1 = blocks; blk1; blk1 = blk1->next) {
+ if (blk1 != blk0 &&
+ blk1->xMin < blk0->xMax &&
+ blk1->xMax > blk0->xMin) {
+ if (blk1->yMax < blk0->yMin) {
+ if (blk1->yMax > blk0->ySpaceT) {
+ blk0->ySpaceT = blk1->yMax;
+ }
+ } else if (blk1->yMin > blk0->yMax) {
+ if (blk1->yMin < blk0->ySpaceB) {
+ blk0->ySpaceB = blk1->yMin;
+ }
+ }
+ }
+ }
+ }
+
+ flow0 = NULL;
+ while (blocks) {
+
+ // build a new flow object
+ flow1 = new TextFlow();
+ flow1->blocks = blocks;
+ flow1->lines = blocks->lines;
+ flow1->yMin = blocks->yMin;
+ flow1->yMax = blocks->yMax;
+ flow1->ySpaceT = blocks->ySpaceT;
+ flow1->ySpaceB = blocks->ySpaceB;
+
+ // find subsequent blocks in the flow
+ for (blk1 = blocks, blk2 = blocks->next;
+ blk2 && flowFit(flow1, blk2);
+ blk1 = blk2, blk2 = blk2->next) {
+ if (blk2->yMin < flow1->yMin) {
+ flow1->yMin = blk2->yMin;
+ }
+ if (blk2->yMax > flow1->yMax) {
+ flow1->yMax = blk2->yMax;
+ }
+ if (blk2->ySpaceT > flow1->ySpaceT) {
+ flow1->ySpaceT = blk2->ySpaceT;
+ }
+ if (blk2->ySpaceB < flow1->ySpaceB) {
+ flow1->ySpaceB = blk2->ySpaceB;
+ }
+ for (line1 = blk1->lines; line1->next; line1 = line1->next) ;
+ line1->flowNext = blk2->lines;
+ }
+
+ // chop the block list
+ blocks = blk1->next;
+ blk1->next = NULL;
+
+ // append the flow to the list
+ if (flow0) {
+ flow0->next = flow1;
+ } else {
+ flows = flow1;
+ }
+ flow0 = flow1;
}
}
+
+#if 0 // for debugging
+ printf("*** flows ***\n");
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ printf("[flow]\n");
+ for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) {
+ printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax,
+ blk0->ySpaceT, blk0->ySpaceB);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- sort lines into yx order
+
+ // (the block/line merging process doesn't maintain the full-page
+ // linked list of lines)
+
+ lines = NULL;
+ if (rawOrder) {
+ line0 = NULL;
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ for (line1 = flow0->lines; line1; line1 = line1->flowNext) {
+ if (line0) {
+ line0->pageNext = line1;
+ } else {
+ lines = line1;
+ }
+ line0 = line1;
+ }
+ }
+ } else {
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ for (line0 = flow0->lines; line0; line0 = line0->flowNext) {
+ for (line1 = NULL, line2 = lines;
+ line2 && !line0->yxBefore(line2);
+ line1 = line2, line2 = line2->pageNext) ;
+ if (line1) {
+ line1->pageNext = line0;
+ } else {
+ lines = line0;
+ }
+ line0->pageNext = line2;
+ }
+ }
+ }
+
+#if 0 // for debugging
+ printf("*** lines in yx order ***\n");
+ for (line0 = lines; line0; line0 = line0->pageNext) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0],
+ line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+}
+
+// If <word> can be added the end of <line>, return the absolute value
+// of the difference between <line>'s baseline and <word>'s baseline,
+// and set *<space> to the horizontal space between the current last
+// word in <line> and <word>. A smaller return value indicates a
+// better fit. Otherwise, return a negative number.
+double TextPage::lineFit(TextLine *line, TextWord *word, double *space) {
+ TextWord *lastWord;
+ double fontSize0, fontSize1;
+ double dx, dxLimit;
+
+ lastWord = line->lastWord;
+ fontSize0 = line->fontSize;
+ fontSize1 = word->fontSize;
+ dx = word->xMin - lastWord->xMax;
+ dxLimit = fontSize0 * lastWord->font->maxSpaceWidth;
+
+ // check inter-word spacing
+ if (dx < fontSize0 * lineMinDeltaX ||
+ dx > dxLimit) {
+ return -1;
+ }
+
+ if (
+ // look for adjacent words with close baselines and close font sizes
+ (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 &&
+ fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+ fontSize1 < lineMaxFontSizeRatio * fontSize0) ||
+
+ // look for a superscript
+ (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 &&
+ fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 &&
+ (word->yMax < lastWord->yMax ||
+ word->yBase < lastWord->yBase) &&
+ word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 &&
+ dx < fontSize0 * lineMaxSuperscriptDeltaX) ||
+
+ // look for a subscript
+ (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 &&
+ fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 &&
+ (word->yMin > lastWord->yMin ||
+ word->yBase > lastWord->yBase) &&
+ line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 &&
+ dx < fontSize0 * lineMaxSubscriptDeltaX)) {
+
+ *space = dx;
+ return fabs(word->yBase - line->yBase);
+ }
+
+ return -1;
}
+// Returns true if <line0> and <line1> can be merged into a single
+// line, ignoring max word spacing.
+GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) {
+ double fontSize0, fontSize1;
+ double dx;
+
+ fontSize0 = line0->fontSize;
+ fontSize1 = line1->fontSize;
+ dx = line1->xMin - line0->xMax;
+
+ // check inter-word spacing
+ if (dx < fontSize0 * lineMinDeltaX) {
+ return gFalse;
+ }
+
+ // look for close baselines and close font sizes
+ if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 &&
+ fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+ fontSize1 < lineMaxFontSizeRatio * fontSize0) {
+ return gTrue;
+ }
+
+ return gFalse;
+}
+
+// Returns true if <line> can be added to <blk>. Assumes the y
+// coordinates are within range.
+GBool TextPage::blockFit(TextBlock *blk, TextLine *line) {
+ double fontSize0, fontSize1;
+
+ // check edges
+ if (line->xMin < blk->xSpaceL ||
+ line->xMax > blk->xSpaceR ||
+ blk->xMin < line->xSpaceL ||
+ blk->xMax > line->xSpaceR) {
+ return gFalse;
+ }
+
+ // check font sizes
+ fontSize0 = blk->lines->fontSize;
+ fontSize1 = line->fontSize;
+ if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+ fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+ return gFalse;
+ }
+
+ return gTrue;
+}
+
+// Returns true if <blk0> and <blk1> can be merged into a single
+// block. Assumes the y coordinates are within range.
+GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) {
+ double fontSize0, fontSize1;
+
+ // check edges
+ if (blk1->xMin < blk0->xSpaceL ||
+ blk1->xMax > blk0->xSpaceR ||
+ blk0->xMin < blk1->xSpaceL ||
+ blk0->xMax > blk1->xSpaceR) {
+ return gFalse;
+ }
+
+ // check font sizes
+ fontSize0 = blk0->lines->fontSize;
+ fontSize1 = blk1->lines->fontSize;
+ if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+ fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+ return gFalse;
+ }
+
+ return gTrue;
+}
+
+// Returns true if <blk> can be added to <flow>.
+GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) {
+ double dy;
+
+ // check whitespace above and below
+ if (blk->yMin < flow->ySpaceT ||
+ blk->yMax > flow->ySpaceB ||
+ flow->yMin < blk->ySpaceT ||
+ flow->yMax > blk->ySpaceB) {
+ return gFalse;
+ }
+
+ // check that block top edge is within +/- dy of flow top edge,
+ // and that block bottom edge is above flow bottom edge + dy
+ dy = flowMaxDeltaY * flow->blocks->maxFontSize;
+ return blk->yMin > flow->yMin - dy &&
+ blk->yMin < flow->yMin + dy &&
+ blk->yMax < flow->yMax + dy;
+}
+
+
GBool TextPage::findText(Unicode *s, int len,
GBool top, GBool bottom,
double *xMin, double *yMin,
double *xMax, double *yMax) {
- TextString *str;
+ TextLine *line;
Unicode *p;
Unicode u1, u2;
int m, i, j;
- double x;
+ double x0, x1, x;
- // scan all strings on page
- for (str = yxStrings; str; str = str->yxNext) {
+ // scan all text on the page
+ for (line = lines; line; line = line->pageNext) {
// check: above top limit?
- if (!top && (str->yMax < *yMin ||
- (str->yMin < *yMin && str->xMax <= *xMin))) {
+ if (!top && (line->yMax < *yMin ||
+ (line->yMin < *yMin && line->xMax <= *xMin))) {
continue;
}
// check: below bottom limit?
- if (!bottom && (str->yMin > *yMax ||
- (str->yMax > *yMax && str->xMin >= *xMax))) {
+ if (!bottom && (line->yMin > *yMax ||
+ (line->yMax > *yMax && line->xMin >= *xMax))) {
return gFalse;
}
- // search each position in this string
- m = str->len;
- for (i = 0, p = str->text; i <= m - len; ++i, ++p) {
+ // search each position in this line
+ m = line->len;
+ for (i = 0, p = line->text; i <= m - len; ++i, ++p) {
+
+ x0 = (i == 0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ x = 0.5 * (x0 + x1);
// check: above top limit?
- if (!top && str->yMin < *yMin) {
- x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
+ if (!top && line->yMin < *yMin) {
if (x < *xMin) {
continue;
}
}
// check: below bottom limit?
- if (!bottom && str->yMax > *yMax) {
- x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
+ if (!bottom && line->yMax > *yMax) {
if (x > *xMax) {
return gFalse;
}
// found it
if (j == len) {
- *xMin = (i == 0) ? str->xMin : str->xRight[i-1];
- *xMax = str->xRight[i + len - 1];
- *yMin = str->yMin;
- *yMax = str->yMax;
+ *xMin = x0;
+ *xMax = line->xRight[i + len - 1];
+ *yMin = line->yMin;
+ *yMax = line->yMax;
return gTrue;
}
}
}
+
return gFalse;
}
double xMax, double yMax) {
GString *s;
UnicodeMap *uMap;
+ GBool isUnicode;
char space[8], eol[16], buf[8];
- int spaceLen, eolLen, n;
- TextString *str1;
- double x0, x1, x2, y;
- double xPrev, yPrev;
- int i1, i2, i;
+ int spaceLen, eolLen, len;
+ TextLine *line, *prevLine;
+ double x0, x1, y;
+ int firstCol, col, i;
GBool multiLine;
s = new GString();
+
+ // get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return s;
}
+ isUnicode = uMap->isUnicode();
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = 0; // make gcc happy
switch (globalParams->getTextEOL()) {
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
break;
}
- xPrev = yPrev = 0;
- multiLine = gFalse;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- y = 0.5 * (str1->yMin + str1->yMax);
- if (y > yMax) {
+
+ // find the leftmost column
+ firstCol = -1;
+ for (line = lines; line; line = line->pageNext) {
+ if (line->yMin > yMax) {
break;
}
- if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) {
- x0 = x1 = x2 = str1->xMin;
- for (i1 = 0; i1 < str1->len; ++i1) {
- x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1];
- x1 = str1->xRight[i1];
- if (0.5 * (x0 + x1) >= xMin) {
- break;
- }
- }
- for (i2 = str1->len - 1; i2 > i1; --i2) {
- x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1];
- x2 = str1->xRight[i2];
- if (0.5 * (x1 + x2) <= xMax) {
- break;
- }
+ if (line->yMax < yMin ||
+ line->xMax < xMin ||
+ line->xMin > xMax) {
+ continue;
+ }
+
+ y = 0.5 * (line->yMin + line->yMax);
+ if (y < yMin || y > yMax) {
+ continue;
+ }
+
+ i = 0;
+ while (i < line->len) {
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ if (0.5 * (x0 + x1) > xMin) {
+ break;
}
- if (s->getLength() > 0) {
- if (x0 < xPrev || str1->yMin > yPrev) {
- s->append(eol, eolLen);
- multiLine = gTrue;
- } else {
- for (i = 0; i < 4; ++i) {
- s->append(space, spaceLen);
- }
- }
+ ++i;
+ }
+ if (i == line->len) {
+ continue;
+ }
+ col = line->col[i];
+
+ if (firstCol < 0 || col < firstCol) {
+ firstCol = col;
+ }
+ }
+
+ // extract the text
+ col = firstCol;
+ multiLine = gFalse;
+ prevLine = NULL;
+ for (line = lines; line; line = line->pageNext) {
+ if (line->yMin > yMax) {
+ break;
+ }
+ if (line->yMax < yMin ||
+ line->xMax < xMin ||
+ line->xMin > xMax) {
+ continue;
+ }
+
+ y = 0.5 * (line->yMin + line->yMax);
+ if (y < yMin || y > yMax) {
+ continue;
+ }
+
+ i = 0;
+ while (i < line->len) {
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ if (0.5 * (x0 + x1) > xMin) {
+ break;
}
- for (i = i1; i <= i2; ++i) {
- n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf));
- s->append(buf, n);
+ ++i;
+ }
+ if (i == line->len) {
+ continue;
+ }
+
+ // insert a return
+ if (line->col[i] < col ||
+ (prevLine &&
+ line->yMin >
+ prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) {
+ s->append(eol, eolLen);
+ col = firstCol;
+ multiLine = gTrue;
+ }
+ prevLine = line;
+
+ // line this block up with the correct column
+ for (; col < line->col[i]; ++col) {
+ s->append(space, spaceLen);
+ }
+
+ // print the portion of the line
+ for (; i < line->len; ++i) {
+
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ if (0.5 * (x0 + x1) > xMax) {
+ break;
}
- xPrev = x2;
- yPrev = str1->yMax;
+
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+ s->append(buf, len);
+ col += isUnicode ? 1 : len;
}
}
+
if (multiLine) {
s->append(eol, eolLen);
}
+
uMap->decRefCnt();
+
return s;
}
-void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
+GBool TextPage::findCharRange(int pos, int length,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax) {
+ TextLine *line;
+ TextWord *word;
+ double x;
+ GBool first;
+ int i;
+
+ //~ this doesn't correctly handle:
+ //~ - ranges split across multiple lines (the highlighted region
+ //~ is the bounding box of all the parts of the range)
+ //~ - cases where characters don't convert one-to-one into Unicode
+ first = gTrue;
+ for (line = lines; line; line = line->pageNext) {
+ for (word = line->words; word; word = word->next) {
+ if (pos < word->charPos + word->charLen &&
+ word->charPos < pos + length) {
+ i = pos - word->charPos;
+ if (i < 0) {
+ i = 0;
+ }
+ x = (i == 0) ? word->xMin : word->xRight[i - 1];
+ if (first || x < *xMin) {
+ *xMin = x;
+ }
+ i = pos + length - word->charPos;
+ if (i >= word->len) {
+ i = word->len - 1;
+ }
+ x = word->xRight[i];
+ if (first || x > *xMax) {
+ *xMax = x;
+ }
+ if (first || word->yMin < *yMin) {
+ *yMin = word->yMin;
+ }
+ if (first || word->yMax > *yMax) {
+ *yMax = word->yMax;
+ }
+ first = gFalse;
+ }
+ }
+ }
+ return !first;
+}
+
+void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
+ GBool physLayout) {
UnicodeMap *uMap;
char space[8], eol[16], eop[8], buf[8];
- int spaceLen, eolLen, eopLen, n;
- TextString *str1, *str2, *str3;
- double yMin, yMax;
- int col1, col2, d, i;
+ int spaceLen, eolLen, eopLen, len;
+ TextFlow *flow;
+ TextLine *line;
+ int col, d, n, i;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
}
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
- // build x-major list
- xyStrings = NULL;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- for (str2 = NULL, str3 = xyStrings;
- str3;
- str2 = str3, str3 = str3->xyNext) {
- if (str1->xMin < str3->xMin ||
- (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) {
- break;
- }
- }
- if (str2) {
- str2->xyNext = str1;
- } else {
- xyStrings = str1;
- }
- str1->xyNext = str3;
- }
+ // output the page, maintaining the original physical layout
+ if (physLayout || rawOrder) {
+ col = 0;
+ for (line = lines; line; line = line->pageNext) {
- // do column assignment
- for (str1 = xyStrings; str1; str1 = str1->xyNext) {
- col1 = 0;
- for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) {
- if (str1->xMin >= str2->xMax) {
- col2 = str2->col + str2->len + 4;
- if (col2 > col1) {
- col1 = col2;
- }
- } else if (str1->xMin > str2->xMin) {
- col2 = str2->col +
- (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) *
- str2->len);
- if (col2 > col1) {
- col1 = col2;
+ // line this block up with the correct column
+ if (!rawOrder) {
+ for (; col < line->col[0]; ++col) {
+ (*outputFunc)(outputStream, space, spaceLen);
}
}
- }
- str1->col = col1;
- }
-
-#if 0 //~ for debugging
- fprintf((FILE *)outputStream, "~~~~~~~~~~\n");
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- fprintf((FILE *)outputStream, "(%4d,%4d) - (%4d,%4d) [%3d] '",
- (int)str1->xMin, (int)str1->yMin,
- (int)str1->xMax, (int)str1->yMax, str1->col);
- for (i = 0; i < str1->len; ++i) {
- fputc(str1->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- fprintf((FILE *)outputStream, "~~~~~~~~~~\n");
-#endif
-
- // output
- col1 = 0;
- yMax = yxStrings ? yxStrings->yMax : 0;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
-
- // line this string up with the correct column
- if (rawOrder && col1 == 0) {
- col1 = str1->col;
- } else {
- for (; col1 < str1->col; ++col1) {
- (*outputFunc)(outputStream, space, spaceLen);
- }
- }
- // print the string
- for (i = 0; i < str1->len; ++i) {
- if ((n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf))) > 0) {
- (*outputFunc)(outputStream, buf, n);
+ // print the line
+ for (i = 0; i < line->len; ++i) {
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+ (*outputFunc)(outputStream, buf, len);
}
- }
-
- // increment column
- col1 += str1->len;
-
- // update yMax for this line
- if (str1->yMax > yMax) {
- yMax = str1->yMax;
- }
-
- // if we've hit the end of the line...
- if (!(str1->yxNext &&
- !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
- str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax &&
- str1->yxNext->xMin >= str1->xMax)) {
-
- // print a return
- (*outputFunc)(outputStream, eol, eolLen);
-
- // print extra vertical space if necessary
- if (str1->yxNext) {
-
- // find yMin for next line
- yMin = str1->yxNext->yMin;
- for (str2 = str1->yxNext; str2; str2 = str2->yxNext) {
- if (str2->yMin < yMin) {
- yMin = str2->yMin;
- }
- if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax &&
- str2->yxNext->xMin >= str2->xMax))
- break;
+ col += line->convertedLen;
+
+ // print one or more returns if necessary
+ if (rawOrder ||
+ !line->pageNext ||
+ line->pageNext->col[0] < col ||
+ line->pageNext->yMin >
+ line->yMax - lineOverlapSlack * line->fontSize) {
+
+ // compute number of returns
+ d = 1;
+ if (line->pageNext) {
+ d += (int)((line->pageNext->yMin - line->yMax) /
+ line->fontSize + 0.5);
}
-
- // print the space
- d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5);
+
// various things (weird font matrices) can result in bogus
// values here, so do a sanity check
- if (rawOrder && d > 2) {
- d = 2;
- } else if (!rawOrder && d > 5) {
+ if (d < 1) {
+ d = 1;
+ } else if (d > 5) {
d = 5;
}
for (; d > 0; --d) {
(*outputFunc)(outputStream, eol, eolLen);
}
+
+ col = 0;
}
+ }
- // set up for next line
- col1 = 0;
- yMax = str1->yxNext ? str1->yxNext->yMax : 0;
+ // output the page, "undoing" the layout
+ } else {
+ for (flow = flows; flow; flow = flow->next) {
+ for (line = flow->lines; line; line = line->flowNext) {
+ n = line->len;
+ if (line->flowNext && line->hyphenated) {
+ --n;
+ }
+ for (i = 0; i < n; ++i) {
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+ (*outputFunc)(outputStream, buf, len);
+ }
+ if (line->flowNext && !line->hyphenated) {
+ (*outputFunc)(outputStream, space, spaceLen);
+ }
+ }
+ (*outputFunc)(outputStream, eol, eolLen);
+ (*outputFunc)(outputStream, eol, eolLen);
}
}
// end of page
- (*outputFunc)(outputStream, eol, eolLen);
(*outputFunc)(outputStream, eop, eopLen);
(*outputFunc)(outputStream, eol, eolLen);
uMap->decRefCnt();
}
+void TextPage::startPage(GfxState *state) {
+ clear();
+ if (state) {
+ pageWidth = state->getPageWidth();
+ pageHeight = state->getPageHeight();
+ } else {
+ pageWidth = pageHeight = 0;
+ }
+}
+
void TextPage::clear() {
- TextString *p1, *p2;
+ TextWord *w1, *w2;
+ TextFlow *f1, *f2;
- if (curStr) {
- delete curStr;
- curStr = NULL;
+ if (curWord) {
+ delete curWord;
+ curWord = NULL;
}
- for (p1 = yxStrings; p1; p1 = p2) {
- p2 = p1->yxNext;
- delete p1;
+ if (words) {
+ for (w1 = words; w1; w1 = w2) {
+ w2 = w1->next;
+ delete w1;
+ }
+ } else if (flows) {
+ for (f1 = flows; f1; f1 = f2) {
+ f2 = f1->next;
+ delete f1;
+ }
}
- yxStrings = NULL;
- xyStrings = NULL;
- yxCur1 = yxCur2 = NULL;
+ deleteGList(fonts, TextFontInfo);
+
+ curWord = NULL;
+ charPos = 0;
+ font = NULL;
+ fontSize = 0;
+ nest = 0;
+ nTinyChars = 0;
+ words = wordPtr = NULL;
+ lines = NULL;
+ flows = NULL;
+ fonts = new GList();
+
}
+
//------------------------------------------------------------------------
// TextOutputDev
//------------------------------------------------------------------------
fwrite(text, 1, len, (FILE *)stream);
}
-TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
+TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
+ GBool rawOrderA, GBool append) {
text = NULL;
+ physLayout = physLayoutA;
rawOrder = rawOrderA;
ok = gTrue;
if (fileName) {
if (!strcmp(fileName, "-")) {
outputStream = stdout;
+#ifdef WIN32
+ // keep DOS from munging the end-of-line characters
+ setmode(fileno(stdout), O_BINARY);
+#endif
} else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
needClose = gTrue;
} else {
}
// set up text object
- text = new TextPage(rawOrder);
+ text = new TextPage(rawOrderA);
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
- GBool rawOrderA) {
+ GBool physLayoutA, GBool rawOrderA) {
outputFunc = func;
outputStream = stream;
needClose = gFalse;
+ physLayout = physLayoutA;
rawOrder = rawOrderA;
- text = new TextPage(rawOrder);
+ text = new TextPage(rawOrderA);
ok = gTrue;
}
}
void TextOutputDev::startPage(int pageNum, GfxState *state) {
- text->clear();
+ text->startPage(state);
}
void TextOutputDev::endPage() {
- text->coalesce();
+ text->coalesce(physLayout);
if (outputStream) {
- text->dump(outputStream, outputFunc);
+ text->dump(outputStream, outputFunc, physLayout);
}
}
}
void TextOutputDev::beginString(GfxState *state, GString *s) {
- text->beginString(state);
+ text->beginWord(state, state->getCurX(), state->getCurY());
}
void TextOutputDev::endString(GfxState *state) {
- text->endString();
+ text->endWord();
}
void TextOutputDev::drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode c, Unicode *u, int uLen) {
- text->addChar(state, x, y, dx, dy, u, uLen);
+ text->addChar(state, x, y, dx, dy, c, u, uLen);
}
GBool TextOutputDev::findText(Unicode *s, int len,
double *xMax, double *yMax) {
return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
}
+
+GString *TextOutputDev::getText(double xMin, double yMin,
+ double xMax, double yMax) {
+ return text->getText(xMin, yMin, xMax, yMax);
+}
+
+GBool TextOutputDev::findCharRange(int pos, int length,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax) {
+ return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
+}
+
+