1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
32 #include "TextOutputDev.h"
35 // needed for setting type/creator of MacOS files
36 #include "ICSupport.h"
39 //------------------------------------------------------------------------
41 //------------------------------------------------------------------------
43 // Minimum and maximum inter-word spacing (as a fraction of the average
45 #define wordMinSpaceWidth 0.3
46 #define wordMaxSpaceWidth 2.0
48 // Default min and max inter-word spacing (when the average character
50 #define wordDefMinSpaceWidth 0.2
51 #define wordDefMaxSpaceWidth 1.5
53 // Max difference in x,y coordinates (as a fraction of the font size)
54 // allowed for duplicated text (fake boldface, drop shadows) which is
56 #define dupMaxDeltaX 0.1
57 #define dupMaxDeltaY 0.2
59 // Min overlap (as a fraction of the font size) required for two
60 // lines to be considered vertically overlapping.
61 #define lineOverlapSlack 0.5
63 // Max difference in baseline y coordinates (as a fraction of the font
64 // size) allowed for words which are to be grouped into a line, not
65 // including sub/superscripts.
66 #define lineMaxBaselineDelta 0.1
68 // Max ratio of font sizes allowed for words which are to be grouped
69 // into a line, not including sub/superscripts.
70 #define lineMaxFontSizeRatio 1.4
72 // Min spacing (as a fraction of the font size) allowed between words
73 // which are to be grouped into a line.
74 #define lineMinDeltaX -0.5
76 // Minimum vertical overlap (as a fraction of the font size) required
77 // for superscript and subscript words.
78 #define lineMinSuperscriptOverlap 0.3
79 #define lineMinSubscriptOverlap 0.3
81 // Min/max ratio of font sizes allowed for sub/superscripts compared to
83 #define lineMinSubscriptFontSizeRatio 0.4
84 #define lineMaxSubscriptFontSizeRatio 1.01
85 #define lineMinSuperscriptFontSizeRatio 0.4
86 #define lineMaxSuperscriptFontSizeRatio 1.01
88 // Max horizontal spacing (as a fraction of the font size) allowed
89 // before sub/superscripts.
90 #define lineMaxSubscriptDeltaX 0.2
91 #define lineMaxSuperscriptDeltaX 0.2
93 // Maximum vertical spacing (as a fraction of the font size) allowed
94 // for lines which are to be grouped into a block.
95 #define blkMaxSpacing 2.0
97 // Max ratio of primary font sizes allowed for lines which are to be
98 // grouped into a block.
99 #define blkMaxFontSizeRatio 1.3
101 // Min overlap (as a fraction of the font size) required for two
102 // blocks to be considered vertically overlapping.
103 #define blkOverlapSlack 0.5
105 // Max vertical spacing (as a fraction of the font size) allowed
106 // between blocks which are 'adjacent' when sorted by reading order.
107 #define blkMaxSortSpacing 2.0
109 // Max vertical offset (as a fraction of the font size) of the top and
110 // bottom edges allowed for blocks which are to be grouped into a
112 #define flowMaxDeltaY 1.0
114 //------------------------------------------------------------------------
116 //------------------------------------------------------------------------
118 TextFontInfo::TextFontInfo(GfxState *state) {
120 double t1, t2, avgWidth, w;
123 gfxFont = state->getFont();
124 textMat = state->getTextMat();
125 horizScaling = state->getHorizScaling();
126 if ((t1 = fabs(textMat[0])) > 0.01 &&
127 (t2 = fabs(textMat[3])) > 0.01) {
128 horizScaling *= t1 / t2;
131 minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
132 maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
133 if (gfxFont && gfxFont->isCIDFont()) {
134 //~ handle 16-bit fonts
135 } else if (gfxFont && gfxFont->getType() != fontType3) {
138 for (i = 0; i < 256; ++i) {
139 w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
147 minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
148 maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
154 TextFontInfo::~TextFontInfo() {
157 GBool TextFontInfo::matches(GfxState *state) {
161 textMat = state->getTextMat();
162 h = state->getHorizScaling();
163 if ((t1 = fabs(textMat[0])) > 0.01 &&
164 (t2 = fabs(textMat[3])) > 0.01) {
167 return state->getFont() == gfxFont &&
168 fabs(h - horizScaling) < 0.01;
171 //------------------------------------------------------------------------
173 //------------------------------------------------------------------------
175 TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
176 TextFontInfo *fontA, double fontSizeA) {
183 fontSize = fontSizeA;
184 state->transform(x0, y0, &x, &y);
185 if ((gfxFont = font->gfxFont)) {
186 yMin = y - gfxFont->getAscent() * fontSize;
187 yMax = y - gfxFont->getDescent() * fontSize;
189 // this means that the PDF file draws text without a current font,
190 // which should never happen
191 yMin = y - 0.95 * fontSize;
192 yMax = y + 0.35 * fontSize;
195 // this is a sanity check for a case that shouldn't happen -- but
196 // if it does happen, we want to avoid dividing by zero later
210 TextWord::~TextWord() {
215 void TextWord::addChar(GfxState *state, double x, double y,
216 double dx, double dy, Unicode u) {
219 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
220 xRight = (double *)grealloc(xRight, size * sizeof(double));
226 xMax = xRight[len] = x + dx;
230 // Returns true if <this> comes before <word2> in xy order.
231 GBool TextWord::xyBefore(TextWord *word2) {
232 return xMin < word2->xMin ||
233 (xMin == word2->xMin && yMin < word2->yMin);
236 // Merge another word onto the end of this one.
237 void TextWord::merge(TextWord *word2) {
241 if (word2->yMin < yMin) {
244 if (word2->yMax > yMax) {
247 if (len + word2->len > size) {
248 size = len + word2->len;
249 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
250 xRight = (double *)grealloc(xRight, size * sizeof(double));
252 for (i = 0; i < word2->len; ++i) {
253 text[len + i] = word2->text[i];
254 xRight[len + i] = word2->xRight[i];
257 charLen += word2->charLen;
260 //------------------------------------------------------------------------
262 //------------------------------------------------------------------------
264 TextLine::TextLine() {
276 TextLine::~TextLine() {
279 for (w1 = words; w1; w1 = w2) {
288 // Returns true if <this> comes before <line2> in yx order, allowing
289 // slack for vertically overlapping lines.
290 GBool TextLine::yxBefore(TextLine *line2) {
293 dy = lineOverlapSlack * fontSize;
295 // non-overlapping case
296 if (line2->yMin > yMax - dy ||
297 line2->yMax < yMin + dy) {
298 return yMin < line2->yMin ||
299 (yMin == line2->yMin && xMin < line2->xMin);
303 return xMin < line2->xMin;
306 // Merge another line's words onto the end of this line.
307 void TextLine::merge(TextLine *line2) {
311 if (line2->yMin < yMin) {
314 if (line2->yMax > yMax) {
317 xSpaceR = line2->xSpaceR;
318 lastWord->spaceAfter = gTrue;
319 lastWord->next = line2->words;
320 lastWord = line2->lastWord;
322 newLen = len + 1 + line2->len;
323 text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
324 xRight = (double *)grealloc(xRight, newLen * sizeof(double));
325 text[len] = (Unicode)0x0020;
326 xRight[len] = line2->xMin;
327 for (i = 0; i < line2->len; ++i) {
328 text[len + 1 + i] = line2->text[i];
329 xRight[len + 1 + i] = line2->xRight[i];
332 convertedLen += line2->convertedLen;
333 hyphenated = line2->hyphenated;
336 //------------------------------------------------------------------------
338 //------------------------------------------------------------------------
340 TextBlock::TextBlock() {
345 TextBlock::~TextBlock() {
348 for (l1 = lines; l1; l1 = l2) {
354 // Returns true if <this> comes before <blk2> in yx order, allowing
355 // slack for vertically overlapping blocks.
356 GBool TextBlock::yxBefore(TextBlock *blk2) {
359 dy = blkOverlapSlack * lines->fontSize;
361 // non-overlapping case
362 if (blk2->yMin > yMax - dy ||
363 blk2->yMax < yMin + dy) {
364 return yMin < blk2->yMin ||
365 (yMin == blk2->yMin && xMin < blk2->xMin);
369 return xMin < blk2->xMin;
372 // Merge another block's line onto the right of this one.
373 void TextBlock::mergeRight(TextBlock *blk2) {
374 lines->merge(blk2->lines);
378 xSpaceR = lines->xSpaceR;
381 // Merge another block's lines onto the bottom of this block.
382 void TextBlock::mergeBelow(TextBlock *blk2) {
385 if (blk2->xMin < xMin) {
388 if (blk2->xMax > xMax) {
392 if (blk2->xSpaceL > xSpaceL) {
393 xSpaceL = blk2->xSpaceL;
395 if (blk2->xSpaceR < xSpaceR) {
396 xSpaceR = blk2->xSpaceR;
398 if (blk2->maxFontSize > maxFontSize) {
399 maxFontSize = blk2->maxFontSize;
401 for (line = lines; line->next; line = line->next) ;
402 line->next = line->flowNext = blk2->lines;
406 //------------------------------------------------------------------------
408 //------------------------------------------------------------------------
410 TextFlow::TextFlow() {
415 TextFlow::~TextFlow() {
418 for (b1 = blocks; b1; b1 = b2) {
425 //------------------------------------------------------------------------
427 //------------------------------------------------------------------------
429 TextPage::TextPage(GBool rawOrderA) {
430 rawOrder = rawOrderA;
437 words = wordPtr = NULL;
443 TextPage::~TextPage() {
448 void TextPage::updateFont(GfxState *state) {
452 int code, mCode, letterCode, anyCode;
456 // get the font info object
458 for (i = 0; i < fonts->getLength(); ++i) {
459 font = (TextFontInfo *)fonts->get(i);
460 if (font->matches(state)) {
466 font = new TextFontInfo(state);
470 // adjust the font size
471 gfxFont = state->getFont();
472 fontSize = state->getTransformedFontSize();
473 if (gfxFont && gfxFont->getType() == fontType3) {
474 // This is a hack which makes it possible to deal with some Type 3
475 // fonts. The problem is that it's impossible to know what the
476 // base coordinate system used in the font is without actually
477 // rendering the font. This code tries to guess by looking at the
478 // width of the character 'm' (which breaks if the font is a
479 // subset that doesn't contain 'm').
480 mCode = letterCode = anyCode = -1;
481 for (code = 0; code < 256; ++code) {
482 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
483 if (name && name[0] == 'm' && name[1] == '\0') {
486 if (letterCode < 0 && name && name[1] == '\0' &&
487 ((name[0] >= 'A' && name[0] <= 'Z') ||
488 (name[0] >= 'a' && name[0] <= 'z'))) {
491 if (anyCode < 0 && name &&
492 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
497 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
498 // 0.6 is a generic average 'm' width -- yes, this is a hack
500 } else if (letterCode >= 0 &&
501 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
502 // even more of a hack: 0.5 is a generic letter width
504 } else if (anyCode >= 0 &&
505 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
506 // better than nothing: 0.5 is a generic character width
509 fm = gfxFont->getFontMatrix();
511 fontSize *= fabs(fm[3] / fm[0]);
516 void TextPage::beginWord(GfxState *state, double x0, double y0) {
517 // This check is needed because Type 3 characters can contain
518 // text-drawing operations (when TextPage is being used via
519 // XOutputDev rather than TextOutputDev).
525 curWord = new TextWord(state, x0, y0, charPos, font, fontSize);
528 void TextPage::addChar(GfxState *state, double x, double y,
529 double dx, double dy,
530 CharCode c, Unicode *u, int uLen) {
531 double x1, y1, w1, h1, dx2, dy2, sp;
534 // if the previous char was a space, addChar will have called
535 // endWord, so we need to start a new word
537 beginWord(state, x, y);
540 // throw away chars that aren't inside the page bounds
541 state->transform(x, y, &x1, &y1);
542 if (x1 < 0 || x1 > pageWidth ||
543 y1 < 0 || y1 > pageHeight) {
547 // subtract char and word spacing from the dx,dy values
548 sp = state->getCharSpace();
549 if (c == (CharCode)0x20) {
550 sp += state->getWordSpace();
552 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
555 state->transformDelta(dx, dy, &w1, &h1);
557 // check the tiny chars limit
558 if (!globalParams->getTextKeepTinyChars() &&
559 fabs(w1) < 3 && fabs(h1) < 3) {
560 if (++nTinyChars > 20000) {
565 // break words at space character
566 if (uLen == 1 && u[0] == (Unicode)0x20) {
573 // large char spacing is sometimes used to move text around -- in
574 // this case, break text into individual chars and let the coalesce
575 // function deal with it later
577 if (n > 0 && x1 - curWord->xRight[n-1] >
578 curWord->font->minSpaceWidth * curWord->fontSize) {
580 beginWord(state, x, y);
583 // page rotation and/or transform matrices can cause text to be
584 // drawn in reverse order -- in this case, swap the begin/end
585 // coordinates and break text into individual chars
588 beginWord(state, x + dx, y + dy);
595 // add the characters to the current word
600 for (i = 0; i < uLen; ++i) {
601 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
607 void TextPage::endWord() {
608 // This check is needed because Type 3 characters can contain
609 // text-drawing operations (when TextPage is being used via
610 // XOutputDev rather than TextOutputDev).
622 void TextPage::addWord(TextWord *word) {
625 // throw away zero-length words -- they don't have valid xMin/xMax
626 // values, and they're useless anyway
627 if (word->len == 0) {
632 // insert word in xy list
637 if (wordPtr && wordPtr->xyBefore(word)) {
644 for (; p2; p1 = p2, p2 = p2->next) {
645 if (word->xyBefore(p2)) {
659 void TextPage::coalesce(GBool physLayout) {
660 TextWord *word0, *word1, *word2;
661 TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
662 TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
663 TextBlock *yxBlocks, *blocks, *blkStack;
664 TextFlow *flow0, *flow1;
665 double sz, xLimit, yLimit;
666 double fit1, fit2, sp1, sp2 = 0.0e+0;
671 int col1, col2, d, i, j;
673 #if 0 // for debugging
674 printf("*** initial word list ***\n");
675 for (word0 = words; word0; word0 = word0->next) {
676 printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
677 word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
678 for (i = 0; i < word0->len; ++i) {
679 fputc(word0->text[i] & 0xff, stdout);
687 //----- discard duplicated text (fake boldface, drop shadows)
691 sz = word0->fontSize;
692 xLimit = word0->xMin + sz * dupMaxDeltaX;
694 for (word1 = word0, word2 = word0->next;
695 word2 && word2->xMin < xLimit;
696 word1 = word2, word2 = word2->next) {
697 if (word2->len == word0->len &&
698 !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
699 fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
700 fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
701 fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
707 word1->next = word2->next;
714 #if 0 // for debugging
715 printf("*** words after removing duplicate text ***\n");
716 for (word0 = words; word0; word0 = word0->next) {
717 printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
718 word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
719 for (i = 0; i < word0->len; ++i) {
720 fputc(word0->text[i] & 0xff, stdout);
732 sz = word0->fontSize;
734 // look for adjacent text which is part of the same word, and
735 // merge it into this word
736 xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
741 word2->xMin < xLimit &&
742 word2->font == word0->font &&
743 fabs(word2->fontSize - sz) < 0.05 &&
744 fabs(word2->yBase - word0->yBase) < 0.05 &&
745 word2->charPos == word0->charPos + word0->charLen;
748 for (word1 = word0, word2 = word0->next;
749 word2 && word2->xMin < xLimit;
750 word1 = word2, word2 = word2->next) {
751 if (word2->font == word0->font &&
752 fabs(word2->fontSize - sz) < 0.05 &&
753 fabs(word2->yBase - word0->yBase) < 0.05 &&
754 word2->charPos == word0->charPos + word0->charLen) {
762 word1->next = word2->next;
770 #if 0 // for debugging
771 printf("*** after merging words ***\n");
772 for (word0 = words; word0; word0 = word0->next) {
773 printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
774 word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
775 for (i = 0; i < word0->len; ++i) {
776 fputc(word0->text[i] & 0xff, stdout);
784 //----- assemble words into lines
786 lineList = line0 = NULL;
789 // remove the first word from the word list
794 // find the best line (if any) for the word
796 if (line0 && lineFit(line0, word0, &sp2) >= 0) {
807 for (line2 = lineList; line2; line2 = line2->next) {
808 fit2 = lineFit(line2, word0, &sp2);
809 if (fit2 >= 0 && (!line1 || fit2 < fit1)) {
817 // found a line: append the word
819 word1 = line1->lastWord;
821 line1->lastWord = word0;
822 if (word0->xMax > line1->xMax) {
823 line1->xMax = word0->xMax;
825 if (word0->yMin < line1->yMin) {
826 line1->yMin = word0->yMin;
828 if (word0->yMax > line1->yMax) {
829 line1->yMax = word0->yMax;
831 line1->len += word0->len;
832 if (sp1 > line1->fontSize * line1->font->minSpaceWidth) {
833 word1->spaceAfter = gTrue;
837 // didn't find a line: create a new line
839 line1 = new TextLine();
840 line1->words = line1->lastWord = word0;
841 line1->xMin = word0->xMin;
842 line1->xMax = word0->xMax;
843 line1->yMin = word0->yMin;
844 line1->yMax = word0->yMax;
845 line1->yBase = word0->yBase;
846 line1->font = word0->font;
847 line1->fontSize = word0->fontSize;
848 line1->len = word0->len;
858 // build the line text
859 uMap = globalParams->getTextEncoding();
860 isUnicode = uMap ? uMap->isUnicode() : gFalse;
862 for (line1 = lineList; line1; line1 = line1->next) {
863 line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
864 line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
865 line1->col = (int *)gmalloc(line1->len * sizeof(int));
867 for (word1 = line1->words; word1; word1 = word1->next) {
868 for (j = 0; j < word1->len; ++j) {
869 line1->text[i] = word1->text[j];
870 line1->xRight[i] = word1->xRight[j];
873 if (word1->spaceAfter && word1->next) {
874 line1->text[i] = (Unicode)0x0020;
875 line1->xRight[i] = word1->next->xMin;
879 line1->convertedLen = 0;
880 for (j = 0; j < line1->len; ++j) {
881 line1->col[j] = line1->convertedLen;
883 ++line1->convertedLen;
885 line1->convertedLen +=
886 uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
890 // check for hyphen at end of line
891 //~ need to check for other chars used as hyphens
892 if (line1->text[line1->len - 1] == (Unicode)'-') {
893 line1->hyphenated = gTrue;
902 #if 0 // for debugging
903 printf("*** lines in xy order ***\n");
904 for (line0 = lineList; line0; line0 = line0->next) {
905 printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
906 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
907 line0->yBase, line0->len);
908 for (word0 = line0->words; word0; word0 = word0->next) {
909 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
910 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
911 word0->yBase, word0->fontSize, word0->spaceAfter);
912 for (i = 0; i < word0->len; ++i) {
913 fputc(word0->text[i] & 0xff, stdout);
922 //----- column assignment
924 for (line1 = lineList; line1; line1 = line1->next) {
926 for (line2 = lineList; line2 != line1; line2 = line2->next) {
927 if (line1->xMin >= line2->xMax) {
928 d = (int)((line1->xMin - line2->xMax) /
929 (line1->font->maxSpaceWidth * line1->fontSize));
933 col2 = line2->col[0] + line2->convertedLen + d;
937 } else if (line1->xMin > line2->xMin) {
938 for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
939 col2 = line2->col[i];
945 for (j = 0; j < line1->len; ++j) {
946 line1->col[j] += col1;
950 #if 0 // for debugging
951 printf("*** lines in xy order, after column assignment ***\n");
952 for (line0 = lineList; line0; line0 = line0->next) {
953 printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n",
954 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
955 line0->yBase, line0->col[0], line0->len);
956 for (word0 = line0->words; word0; word0 = word0->next) {
957 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
958 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
959 word0->yBase, word0->fontSize, word0->spaceAfter);
960 for (i = 0; i < word0->len; ++i) {
961 fputc(word0->text[i] & 0xff, stdout);
970 //----- assemble lines into blocks
975 for (line1 = lines; line1; line1 = line1->next) {
977 line1->xSpaceR = pageWidth;
982 // sort lines into yx order
986 lineList = lineList->next;
987 for (line1 = NULL, line2 = lines;
988 line2 && !line0->yxBefore(line2);
989 line1 = line2, line2 = line2->next) ;
998 // compute whitespace to left and right of each line
1000 for (line1 = lines; line1; line1 = line1->next) {
1002 // find the first vertically overlapping line
1003 for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
1005 // check each vertically overlapping line -- look for the nearest
1008 line1->xSpaceR = pageWidth;
1010 line2 && line2->yMin < line1->yMax;
1011 line2 = line2->next) {
1012 if (line2->yMax > line1->yMin) {
1013 if (line2->xMax < line1->xMin) {
1014 if (line2->xMax > line1->xSpaceL) {
1015 line1->xSpaceL = line2->xMax;
1017 } else if (line2->xMin > line1->xMax) {
1018 if (line2->xMin < line1->xSpaceR) {
1019 line1->xSpaceR = line2->xMin;
1027 #if 0 // for debugging
1028 printf("*** lines in yx order ***\n");
1029 for (line0 = lines; line0; line0 = line0->next) {
1030 printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
1031 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
1032 line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
1033 for (word0 = line0->words; word0; word0 = word0->next) {
1034 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
1035 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1036 word0->yBase, word0->fontSize, word0->spaceAfter);
1037 for (i = 0; i < word0->len; ++i) {
1038 fputc(word0->text[i] & 0xff, stdout);
1052 // build a new block object
1054 lineList = lineList->next;
1056 blk1 = new TextBlock();
1057 blk1->lines = line0;
1058 blk1->xMin = line0->xMin;
1059 blk1->xMax = line0->xMax;
1060 blk1->yMin = line0->yMin;
1061 blk1->yMax = line0->yMax;
1062 blk1->xSpaceL = line0->xSpaceL;
1063 blk1->xSpaceR = line0->xSpaceR;
1064 blk1->maxFontSize = line0->fontSize;
1066 // find subsequent lines in the block
1069 // look for the first horizontally overlapping line below this
1071 yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
1072 line3 = line4 = NULL;
1074 if (lineList->yMin < yLimit &&
1075 lineList->xMax > blk1->xMin &&
1076 lineList->xMin < blk1->xMax) {
1081 for (line1 = NULL, line2 = lineList;
1082 line2 && line2->yMin < yLimit;
1083 line1 = line2, line2 = line2->next) {
1084 if (line2->xMax > blk1->xMin &&
1085 line2->xMin < blk1->xMax) {
1093 // if there is an overlapping line and it fits in the block, add
1095 if (line4 && blockFit(blk1, line4)) {
1097 line3->next = line4->next;
1099 lineList = line4->next;
1101 line0->next = line0->flowNext = line4;
1103 if (line4->xMin < blk1->xMin) {
1104 blk1->xMin = line4->xMin;
1105 } else if (line4->xMax > blk1->xMax) {
1106 blk1->xMax = line4->xMax;
1108 if (line4->yMax > blk1->yMax) {
1109 blk1->yMax = line4->yMax;
1111 if (line4->xSpaceL > blk1->xSpaceL) {
1112 blk1->xSpaceL = line4->xSpaceL;
1114 if (line4->xSpaceR < blk1->xSpaceR) {
1115 blk1->xSpaceR = line4->xSpaceR;
1117 if (line4->fontSize > blk1->maxFontSize) {
1118 blk1->maxFontSize = line4->fontSize;
1122 // otherwise, we're done with this block
1128 // insert block on list, in yx order
1134 for (blk2 = NULL, blk3 = yxBlocks;
1135 blk3 && !blk1->yxBefore(blk3);
1136 blk2 = blk3, blk3 = blk3->next) ;
1146 #if 0 // for debugging
1147 printf("*** blocks in yx order ***\n");
1148 for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
1149 printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
1150 blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
1151 for (line0 = blk0->lines; line0; line0 = line0->next) {
1152 printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
1153 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
1154 line0->yBase, line0->len);
1155 for (word0 = line0->words; word0; word0 = word0->next) {
1156 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
1157 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1158 word0->yBase, word0->spaceAfter);
1159 for (i = 0; i < word0->len; ++i) {
1160 fputc(word0->text[i] & 0xff, stdout);
1170 //----- merge lines and blocks, sort blocks into reading order
1181 // find the next two blocks:
1182 // - if the depth-first traversal stack is empty, take the first
1183 // (upper-left-most) two blocks on the yx-sorted block list
1184 // - otherwise, find the two upper-left-most blocks under the top
1185 // block on the stack
1187 blk3 = blk4 = blk5 = blk6 = NULL;
1188 for (blk1 = NULL, blk2 = yxBlocks;
1190 blk1 = blk2, blk2 = blk2->next) {
1191 if (blk2->yMin > blkStack->yMin &&
1192 blk2->xMax > blkStack->xMin &&
1193 blk2->xMin < blkStack->xMax) {
1194 if (!blk4 || blk2->yxBefore(blk4)) {
1199 } else if (!blk6 || blk2->yxBefore(blk6)) {
1209 blk6 = yxBlocks->next;
1214 // | blkStack | | blkStack
1215 // +---------------------+ --> +--------------
1216 // +------+ +------+ +-----------+
1217 // | blk4 | | blk6 | ... | blk4+blk6 |
1218 // +------+ +------+ +-----------+
1219 yLimit = 0; // make gcc happy
1221 yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize;
1223 if (blkStack && blk4 && blk6 &&
1224 !blk4->lines->next && !blk6->lines->next &&
1225 lineFit2(blk4->lines, blk6->lines) &&
1226 blk4->yMin < yLimit &&
1227 blk4->xMin > blkStack->xSpaceL &&
1228 blkStack->xMin > blk4->xSpaceL &&
1229 blk6->xMax < blkStack->xSpaceR) {
1230 blk4->mergeRight(blk6);
1232 blk5->next = blk6->next;
1234 yxBlocks = blk6->next;
1241 // +---------------------+ --> | blkStack+blk4 |
1242 // +---------------------+ | |
1245 } else if (blkStack && blk4 &&
1246 blk4->yMin < yLimit &&
1247 blockFit2(blkStack, blk4)) {
1248 blkStack->mergeBelow(blk4);
1250 blk3->next = blk4->next;
1252 yxBlocks = blk4->next;
1257 // 1. no block found
1258 // 2. non-fully overlapping block found
1259 // 3. large vertical gap above the overlapping block
1260 // then pop the stack and try again
1262 (blkStack && (blk4->xMin < blkStack->xSpaceL ||
1263 blk4->xMax > blkStack->xSpaceR ||
1264 blk4->yMin - blkStack->yMax >
1265 blkMaxSortSpacing * blkStack->maxFontSize))) {
1266 blkStack = blkStack->stackNext;
1268 // add a block to the sorted list
1271 // remove the block from the yx-sorted list
1273 blk3->next = blk4->next;
1275 yxBlocks = blk4->next;
1279 // append the block to the reading-order list
1287 // push the block on the traversal stack
1289 blk4->stackNext = blkStack;
1296 #if 0 // for debugging
1297 printf("*** blocks in reading order (after merging) ***\n");
1298 for (blk0 = blocks; blk0; blk0 = blk0->next) {
1299 printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
1300 blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
1301 for (line0 = blk0->lines; line0; line0 = line0->next) {
1302 printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
1303 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
1304 line0->yBase, line0->len);
1305 for (word0 = line0->words; word0; word0 = word0->next) {
1306 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
1307 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1308 word0->yBase, word0->spaceAfter);
1309 for (i = 0; i < word0->len; ++i) {
1310 fputc(word0->text[i] & 0xff, stdout);
1320 //----- assemble blocks into flows
1324 // one flow per block
1327 flow1 = new TextFlow();
1328 flow1->blocks = blocks;
1329 flow1->lines = blocks->lines;
1330 flow1->yMin = blocks->yMin;
1331 flow1->yMax = blocks->yMax;
1332 blocks = blocks->next;
1333 flow1->blocks->next = NULL;
1335 flow0->next = flow1;
1344 // compute whitespace above and below each block
1345 for (blk0 = blocks; blk0; blk0 = blk0->next) {
1347 blk0->ySpaceB = pageHeight;
1349 // check each horizontally overlapping block
1350 for (blk1 = blocks; blk1; blk1 = blk1->next) {
1352 blk1->xMin < blk0->xMax &&
1353 blk1->xMax > blk0->xMin) {
1354 if (blk1->yMax < blk0->yMin) {
1355 if (blk1->yMax > blk0->ySpaceT) {
1356 blk0->ySpaceT = blk1->yMax;
1358 } else if (blk1->yMin > blk0->yMax) {
1359 if (blk1->yMin < blk0->ySpaceB) {
1360 blk0->ySpaceB = blk1->yMin;
1370 // build a new flow object
1371 flow1 = new TextFlow();
1372 flow1->blocks = blocks;
1373 flow1->lines = blocks->lines;
1374 flow1->yMin = blocks->yMin;
1375 flow1->yMax = blocks->yMax;
1376 flow1->ySpaceT = blocks->ySpaceT;
1377 flow1->ySpaceB = blocks->ySpaceB;
1379 // find subsequent blocks in the flow
1380 for (blk1 = blocks, blk2 = blocks->next;
1381 blk2 && flowFit(flow1, blk2);
1382 blk1 = blk2, blk2 = blk2->next) {
1383 if (blk2->yMin < flow1->yMin) {
1384 flow1->yMin = blk2->yMin;
1386 if (blk2->yMax > flow1->yMax) {
1387 flow1->yMax = blk2->yMax;
1389 if (blk2->ySpaceT > flow1->ySpaceT) {
1390 flow1->ySpaceT = blk2->ySpaceT;
1392 if (blk2->ySpaceB < flow1->ySpaceB) {
1393 flow1->ySpaceB = blk2->ySpaceB;
1395 for (line1 = blk1->lines; line1->next; line1 = line1->next) ;
1396 line1->flowNext = blk2->lines;
1399 // chop the block list
1400 blocks = blk1->next;
1403 // append the flow to the list
1405 flow0->next = flow1;
1413 #if 0 // for debugging
1414 printf("*** flows ***\n");
1415 for (flow0 = flows; flow0; flow0 = flow0->next) {
1417 for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) {
1418 printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n",
1419 blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax,
1420 blk0->ySpaceT, blk0->ySpaceB);
1421 for (line0 = blk0->lines; line0; line0 = line0->next) {
1422 printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
1423 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
1424 line0->yBase, line0->len);
1425 for (word0 = line0->words; word0; word0 = word0->next) {
1426 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
1427 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1428 word0->yBase, word0->spaceAfter);
1429 for (i = 0; i < word0->len; ++i) {
1430 fputc(word0->text[i] & 0xff, stdout);
1441 //----- sort lines into yx order
1443 // (the block/line merging process doesn't maintain the full-page
1444 // linked list of lines)
1449 for (flow0 = flows; flow0; flow0 = flow0->next) {
1450 for (line1 = flow0->lines; line1; line1 = line1->flowNext) {
1452 line0->pageNext = line1;
1460 for (flow0 = flows; flow0; flow0 = flow0->next) {
1461 for (line0 = flow0->lines; line0; line0 = line0->flowNext) {
1462 for (line1 = NULL, line2 = lines;
1463 line2 && !line0->yxBefore(line2);
1464 line1 = line2, line2 = line2->pageNext) ;
1466 line1->pageNext = line0;
1470 line0->pageNext = line2;
1475 #if 0 // for debugging
1476 printf("*** lines in yx order ***\n");
1477 for (line0 = lines; line0; line0 = line0->pageNext) {
1478 printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n",
1479 line0->xMin, line0->xMax, line0->yMin, line0->yMax,
1480 line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0],
1482 for (word0 = line0->words; word0; word0 = word0->next) {
1483 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
1484 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1485 word0->yBase, word0->spaceAfter);
1486 for (i = 0; i < word0->len; ++i) {
1487 fputc(word0->text[i] & 0xff, stdout);
1497 // If <word> can be added to the end of <line>, return the absolute value
1498 // of the difference between <line>'s baseline and <word>'s baseline,
1499 // and set *<space> to the horizontal space between the current last
1500 // word in <line> and <word>. A smaller return value indicates a
1501 // better fit. Otherwise, return a negative number.
1502 double TextPage::lineFit(TextLine *line, TextWord *word, double *space) {
1504 double fontSize0, fontSize1;
1507 lastWord = line->lastWord;
1508 fontSize0 = line->fontSize;
1509 fontSize1 = word->fontSize;
1510 dx = word->xMin - lastWord->xMax;
1511 dxLimit = fontSize0 * lastWord->font->maxSpaceWidth;
1513 // check inter-word spacing
1514 if (dx < fontSize0 * lineMinDeltaX ||
1520 // look for adjacent words with close baselines and close font sizes
1521 (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 &&
1522 fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
1523 fontSize1 < lineMaxFontSizeRatio * fontSize0) ||
1525 // look for a superscript
1526 (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 &&
1527 fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 &&
1528 (word->yMax < lastWord->yMax ||
1529 word->yBase < lastWord->yBase) &&
1530 word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 &&
1531 dx < fontSize0 * lineMaxSuperscriptDeltaX) ||
1533 // look for a subscript
1534 (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 &&
1535 fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 &&
1536 (word->yMin > lastWord->yMin ||
1537 word->yBase > lastWord->yBase) &&
1538 line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 &&
1539 dx < fontSize0 * lineMaxSubscriptDeltaX)) {
1542 return fabs(word->yBase - line->yBase);
1548 // Returns true if <line0> and <line1> can be merged into a single
1549 // line, ignoring max word spacing.
// <line1> is treated as the right-hand piece: the gap is measured from
// <line0>'s xMax to <line1>'s xMin, and thresholds scale with <line0>'s
// font size.
1550 GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) {
1551 double fontSize0, fontSize1;
1554 fontSize0 = line0->fontSize;
1555 fontSize1 = line1->fontSize;
// dx is negative when the two lines overlap horizontally
1556 dx = line1->xMin - line0->xMax;
1558 // check inter-word spacing
// (only a lower bound is tested; lineMinDeltaX is negative, so a limited
// overlap is tolerated)
1559 if (dx < fontSize0 * lineMinDeltaX) {
1563 // look for close baselines and close font sizes
// (the font size ratio test is applied in both directions)
1564 if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 &&
1565 fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
1566 fontSize1 < lineMaxFontSizeRatio * fontSize0) {
1573 // Returns true if <line> can be added to <blk>. Assumes the y
1574 // coordinates are within range.
1575 GBool TextPage::blockFit(TextBlock *blk, TextLine *line) {
1576 double fontSize0, fontSize1;
// Horizontal test: this condition is true when <line> extends outside
// <blk>'s [xSpaceL, xSpaceR] interval or <blk> extends outside <line>'s.
// NOTE(review): xSpaceL/xSpaceR presumably bound the whitespace around
// each object -- confirm where they are computed.
1579 if (line->xMin < blk->xSpaceL ||
1580 line->xMax > blk->xSpaceR ||
1581 blk->xMin < line->xSpaceL ||
1582 blk->xMax > line->xSpaceR) {
// Font size test: compares the font size of the line at the head of
// blk->lines with <line>'s; the condition is true when either one exceeds
// the other by more than a factor of blkMaxFontSizeRatio.
1587 fontSize0 = blk->lines->fontSize;
1588 fontSize1 = line->fontSize;
1589 if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
1590 fontSize1 > blkMaxFontSizeRatio * fontSize0) {
1597 // Returns true if <blk0> and <blk1> can be merged into a single
1598 // block. Assumes the y coordinates are within range.
1599 GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) {
1600 double fontSize0, fontSize1;
// Horizontal test: this condition is true when either block extends
// outside the other's [xSpaceL, xSpaceR] interval.
1603 if (blk1->xMin < blk0->xSpaceL ||
1604 blk1->xMax > blk0->xSpaceR ||
1605 blk0->xMin < blk1->xSpaceL ||
1606 blk0->xMax > blk1->xSpaceR) {
// Font size test: compares the font sizes of the lines at the head of
// each block's line list; the condition is true when either one exceeds
// the other by more than a factor of blkMaxFontSizeRatio.
1611 fontSize0 = blk0->lines->fontSize;
1612 fontSize1 = blk1->lines->fontSize;
1613 if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
1614 fontSize1 > blkMaxFontSizeRatio * fontSize0) {
1621 // Returns true if <blk> can be added to <flow>.
1622 GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) {
1625 // check whitespace above and below
// (true when <blk> extends outside <flow>'s [ySpaceT, ySpaceB] interval
// or <flow> extends outside <blk>'s)
1626 if (blk->yMin < flow->ySpaceT ||
1627 blk->yMax > flow->ySpaceB ||
1628 flow->yMin < blk->ySpaceT ||
1629 flow->yMax > blk->ySpaceB) {
1633 // check that block top edge is within +/- dy of flow top edge,
1634 // and that block bottom edge is above flow bottom edge + dy
// dy scales with the maxFontSize of the block at the head of flow->blocks
1635 dy = flowMaxDeltaY * flow->blocks->maxFontSize;
1636 return blk->yMin > flow->yMin - dy &&
1637 blk->yMin < flow->yMin + dy &&
1638 blk->yMax < flow->yMax + dy;
// Search the page's lines, in pageNext order, for the <len>-character
// Unicode string <s>.  The incoming *xMin/*yMin values act as a starting
// limit only when <top> is false, and the incoming *xMax/*yMax values act
// as a stopping limit only when <bottom> is false.  The comparison folds
// Latin A-Z on both sides.
// NOTE(review): the match's bounding box is presumably written back
// through xMin/yMin/xMax/yMax -- only the *xMax store is documented here;
// confirm the remaining outputs and the return value.
1642 GBool TextPage::findText(Unicode *s, int len,
1643 GBool top, GBool bottom,
1644 double *xMin, double *yMin,
1645 double *xMax, double *yMax) {
1652 // scan all text on the page
1653 for (line = lines; line; line = line->pageNext) {
1655 // check: above top limit?
// (the line lies entirely before *yMin, or it starts before *yMin and
// ends at or before *xMin)
1656 if (!top && (line->yMax < *yMin ||
1657 (line->yMin < *yMin && line->xMax <= *xMin))) {
1661 // check: below bottom limit?
// (the line lies entirely past *yMax, or it ends past *yMax and starts
// at or past *xMax)
1662 if (!bottom && (line->yMin > *yMax ||
1663 (line->yMax > *yMax && line->xMin >= *xMax))) {
1667 // search each position in this line
// i is the candidate start offset and p points at the matching character;
// the bound m - len leaves room for <len> characters
1669 for (i = 0, p = line->text; i <= m - len; ++i, ++p) {
// x0/x1: left and right edges of character i (xRight[i-1] is the previous
// character's right edge); x is the character's horizontal midpoint
1671 x0 = (i == 0) ? line->xMin : line->xRight[i-1];
1672 x1 = line->xRight[i];
1673 x = 0.5 * (x0 + x1);
1675 // check: above top limit?
1676 if (!top && line->yMin < *yMin) {
1682 // check: below bottom limit?
1683 if (!bottom && line->yMax > *yMax) {
1689 // compare the strings
1690 for (j = 0; j < len; ++j) {
1691 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
1692 //~ extended to handle other character sets
// 0x41..0x5a is 'A'..'Z'; the test is applied to the page text (p) and to
// the search string (s) alike
1693 if (p[j] >= 0x41 && p[j] <= 0x5a) {
1698 if (s[j] >= 0x41 && s[j] <= 0x5a) {
// right edge of the last matched character
1712 *xMax = line->xRight[i + len - 1];
// Collect the text that falls inside the rectangle (xMin, yMin) -
// (xMax, yMax), converted with the configured text encoding, into the
// GString s.  The lines are walked twice: the first pass finds the
// leftmost column used by the selection, the second pass appends padding,
// characters and end-of-lines.
1723 GString *TextPage::getText(double xMin, double yMin,
1724 double xMax, double yMax) {
// fixed-size buffers for the encoded space, end-of-line and one character
1728 char space[8], eol[16], buf[8];
1729 int spaceLen, eolLen, len;
1730 TextLine *line, *prevLine;
1732 int firstCol, col, i;
1737 // get the output encoding
1738 if (!(uMap = globalParams->getTextEncoding())) {
1741 isUnicode = uMap->isUnicode();
// pre-encode the space and end-of-line sequences; mapUnicode's result is
// used as the encoded length
1742 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1743 eolLen = 0; // make gcc happy
1744 switch (globalParams->getTextEOL()) {
// LF
1746 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
// CR followed by LF, written back to back into eol
1749 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1750 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
// CR
1753 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1757 // find the leftmost column
1759 for (line = lines; line; line = line->pageNext) {
// line starts past the bottom of the rectangle
1760 if (line->yMin > yMax) {
// line does not intersect the rectangle at all
1763 if (line->yMax < yMin ||
1764 line->xMax < xMin ||
1765 line->xMin > xMax) {
// the line's vertical midpoint must lie inside [yMin, yMax]
1769 y = 0.5 * (line->yMin + line->yMax);
1770 if (y < yMin || y > yMax) {
// look for the first character whose horizontal midpoint is right of xMin
1775 while (i < line->len) {
1776 x0 = (i==0) ? line->xMin : line->xRight[i-1];
1777 x1 = line->xRight[i];
1778 if (0.5 * (x0 + x1) > xMin) {
// no character of this line qualified
1783 if (i == line->len) {
// track the smallest column seen; firstCol < 0 means "not set yet"
1788 if (firstCol < 0 || col < firstCol) {
// second pass: same line filters as above, this time producing output
1797 for (line = lines; line; line = line->pageNext) {
1798 if (line->yMin > yMax) {
1801 if (line->yMax < yMin ||
1802 line->xMax < xMin ||
1803 line->xMin > xMax) {
1807 y = 0.5 * (line->yMin + line->yMax);
1808 if (y < yMin || y > yMax) {
1813 while (i < line->len) {
1814 x0 = (i==0) ? line->xMin : line->xRight[i-1];
1815 x1 = line->xRight[i];
1816 if (0.5 * (x0 + x1) > xMin) {
1821 if (i == line->len) {
// emit an end-of-line when this line's column is behind the current
// output column, or when its vertical position relative to prevLine
// (with a font-size-scaled slack) calls for one
1826 if (line->col[i] < col ||
1829 prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) {
1830 s->append(eol, eolLen);
1836 // line this block up with the correct column
1837 for (; col < line->col[i]; ++col) {
1838 s->append(space, spaceLen);
1841 // print the portion of the line
1842 for (; i < line->len; ++i) {
// tested against the character's horizontal midpoint and xMax
1844 x0 = (i==0) ? line->xMin : line->xRight[i-1];
1845 x1 = line->xRight[i];
1846 if (0.5 * (x0 + x1) > xMax) {
1850 len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
1851 s->append(buf, len);
// the column advances by one per character for Unicode encodings,
// otherwise by the encoded length
1852 col += isUnicode ? 1 : len;
1857 s->append(eol, eolLen);
// Compute a bounding box, through xMin/yMin/xMax/yMax, for the characters
// in the range [pos, pos + length), using each word's charPos/charLen to
// locate them.
1865 GBool TextPage::findCharRange(int pos, int length,
1866 double *xMin, double *yMin,
1867 double *xMax, double *yMax) {
1874 //~ this doesn't correctly handle:
1875 //~ - ranges split across multiple lines (the highlighted region
1876 //~ is the bounding box of all the parts of the range)
1877 //~ - cases where characters don't convert one-to-one into Unicode
1879 for (line = lines; line; line = line->pageNext) {
1880 for (word = line->words; word; word = word->next) {
// the word's range [charPos, charPos + charLen) overlaps the requested
// range [pos, pos + length)
1881 if (pos < word->charPos + word->charLen &&
1882 word->charPos < pos + length) {
// index of the first requested character within this word
1883 i = pos - word->charPos;
// left edge of character i: the word's xMin for the first character,
// otherwise the previous character's right edge
1887 x = (i == 0) ? word->xMin : word->xRight[i - 1];
// <first> forces the initial assignment; afterwards keep the extreme value
1888 if (first || x < *xMin) {
// index just past the last requested character, relative to this word
1891 i = pos + length - word->charPos;
// guards the xRight[] index below against running past the word
1892 if (i >= word->len) {
1895 x = word->xRight[i];
1896 if (first || x > *xMax) {
1899 if (first || word->yMin < *yMin) {
1902 if (first || word->yMax > *yMax) {
// Write the page's text through <outputFunc>, passing <outputStream> as
// its first argument, using the configured text encoding.  Either keeps
// the physical layout (column padding and line spacing) or writes the
// text flow by flow.
1912 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
// fixed-size buffers for the encoded space, end-of-line, end-of-page and
// one character
1915 char space[8], eol[16], eop[8], buf[8];
1916 int spaceLen, eolLen, eopLen, len;
1921 // get the output encoding
1922 if (!(uMap = globalParams->getTextEncoding())) {
// pre-encode the separator sequences; mapUnicode's result is used as the
// encoded length
1925 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1926 eolLen = 0; // make gcc happy
1927 switch (globalParams->getTextEOL()) {
// LF
1929 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
// CR followed by LF, written back to back into eol
1932 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1933 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
// CR
1936 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
// end-of-page marker: form feed
1939 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
1941 // output the page, maintaining the original physical layout
1942 if (physLayout || rawOrder) {
1944 for (line = lines; line; line = line->pageNext) {
1946 // line this block up with the correct column
1948 for (; col < line->col[0]; ++col) {
1949 (*outputFunc)(outputStream, space, spaceLen);
// encode and write the line one character at a time
1954 for (i = 0; i < line->len; ++i) {
1955 len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
1956 (*outputFunc)(outputStream, buf, len);
1958 col += line->convertedLen;
1960 // print one or more returns if necessary
// (the next line starts at a column behind the current one, or its yMin
// exceeds this line's yMax less a font-size-scaled slack)
1963 line->pageNext->col[0] < col ||
1964 line->pageNext->yMin >
1965 line->yMax - lineOverlapSlack * line->fontSize) {
1967 // compute number of returns
// vertical gap to the next line in units of this line's font size;
// adding 0.5 before the int cast rounds a non-negative value to nearest
1969 if (line->pageNext) {
1970 d += (int)((line->pageNext->yMin - line->yMax) /
1971 line->fontSize + 0.5);
1974 // various things (weird font matrices) can result in bogus
1975 // values here, so do a sanity check
1981 for (; d > 0; --d) {
1982 (*outputFunc)(outputStream, eol, eolLen);
1989 // output the page, "undoing" the layout
1991 for (flow = flows; flow; flow = flow->next) {
1992 for (line = flow->lines; line; line = line->flowNext) {
// a hyphenated line that is followed by another line in the flow gets
// special handling
// NOTE(review): presumably the trailing hyphen is left out of the output
// so that the word is rejoined -- confirm.
1994 if (line->flowNext && line->hyphenated) {
1997 for (i = 0; i < n; ++i) {
1998 len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
1999 (*outputFunc)(outputStream, buf, len);
// consecutive lines of a flow are joined with a single space unless the
// line was hyphenated
2001 if (line->flowNext && !line->hyphenated) {
2002 (*outputFunc)(outputStream, space, spaceLen);
// two end-of-lines, i.e. a blank line
2005 (*outputFunc)(outputStream, eol, eolLen);
2006 (*outputFunc)(outputStream, eol, eolLen);
// end of page: form feed followed by an end-of-line
2011 (*outputFunc)(outputStream, eop, eopLen);
2012 (*outputFunc)(outputStream, eol, eolLen);
// Begin a new page: records the page dimensions.
// NOTE(review): the two assignments below are alternatives -- presumably
// the dimensions come from <state> when it is non-null and are zeroed
// otherwise; confirm.
2017 void TextPage::startPage(GfxState *state) {
2020 pageWidth = state->getPageWidth();
2021 pageHeight = state->getPageHeight();
2023 pageWidth = pageHeight = 0;
// Discard the page's accumulated content and reset the containers so the
// object can be reused.
2027 void TextPage::clear() {
// walk the word list; w2 carries the successor so that advancing does not
// depend on w1 afterwards
2036 for (w1 = words; w1; w1 = w2) {
// same traversal pattern for the flow list
2041 for (f1 = flows; f1; f1 = f2) {
// delete the font list together with its TextFontInfo elements
2046 deleteGList(fonts, TextFontInfo);
// reset the word list pointers
2054 words = wordPtr = NULL;
// start over with an empty font list
2057 fonts = new GList();
2062 //------------------------------------------------------------------------
2064 //------------------------------------------------------------------------
// Output callback that writes <len> bytes of <text> to <stream>, which
// must be a FILE *.  The result of fwrite is not checked.
2066 static void outputToFile(void *stream, char *text, int len) {
2067 fwrite(text, 1, len, (FILE *)stream);
// Construct a text output device that writes to <fileName>; the name "-"
// selects stdout.  <append> chooses between appending to and truncating
// the file.  <physLayoutA> and <rawOrderA> are stored for later use, and
// <rawOrderA> is also passed to the TextPage.
2070 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
2071 GBool rawOrderA, GBool append) {
2073 physLayout = physLayoutA;
2074 rawOrder = rawOrderA;
2080 if (!strcmp(fileName, "-")) {
2081 outputStream = stdout;
2083 // keep DOS from munging the end-of-line characters
2084 setmode(fileno(stdout), O_BINARY);
// the file is opened in binary mode: "ab" appends, "wb" truncates
2086 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
// reported when the file could not be opened
2089 error(-1, "Couldn't open text file '%s'", fileName);
// output goes through the FILE *-based callback
2093 outputFunc = &outputToFile;
2095 outputStream = NULL;
2098 // set up text object
2099 text = new TextPage(rawOrderA);
// Construct a text output device that delivers its output through a
// caller-supplied callback; <stream> is stored as the output stream.
// NOTE(review): <func> is presumably stored as outputFunc -- confirm.
2102 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
2103 GBool physLayoutA, GBool rawOrderA) {
2105 outputStream = stream;
2107 physLayout = physLayoutA;
2108 rawOrder = rawOrderA;
2109 text = new TextPage(rawOrderA);
// Destructor: releases the output file.
// NOTE(review): the casts below treat outputStream as a FILE *, which
// holds only for the file name constructor -- confirm that these
// statements are guarded accordingly.
2113 TextOutputDev::~TextOutputDev() {
// NOTE(review): presumably the MacOS type/creator assignment that
// ICSupport.h is included for -- confirm.
2116 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
2118 fclose((FILE *)outputStream);
// Start a new page by forwarding <state> to the TextPage; <pageNum> is
// not used by the call below.
2125 void TextOutputDev::startPage(int pageNum, GfxState *state) {
2126 text->startPage(state);
// Finish the page: coalesce the collected text, then write it out
// through the output callback.
2129 void TextOutputDev::endPage() {
2130 text->coalesce(physLayout);
2132 text->dump(outputStream, outputFunc, physLayout);
// Forward a font change to the TextPage.
2136 void TextOutputDev::updateFont(GfxState *state) {
2137 text->updateFont(state);
// Begin a new word at the graphics state's current position; <s> is not
// used by the call below.
2140 void TextOutputDev::beginString(GfxState *state, GString *s) {
2141 text->beginWord(state, state->getCurX(), state->getCurY());
// End-of-string callback paired with beginString.
// NOTE(review): presumably closes the word that beginString opened --
// confirm.
2144 void TextOutputDev::endString(GfxState *state) {
// Add one character to the TextPage.  <originX> and <originY> are not
// forwarded; every other argument is passed through unchanged.
2148 void TextOutputDev::drawChar(GfxState *state, double x, double y,
2149 double dx, double dy,
2150 double originX, double originY,
2151 CharCode c, Unicode *u, int uLen) {
2152 text->addChar(state, x, y, dx, dy, c, u, uLen);
// Thin wrapper: forwards all arguments to TextPage::findText and returns
// its result.
2155 GBool TextOutputDev::findText(Unicode *s, int len,
2156 GBool top, GBool bottom,
2157 double *xMin, double *yMin,
2158 double *xMax, double *yMax) {
2159 return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
// Thin wrapper: forwards the rectangle to TextPage::getText and returns
// its result.
2162 GString *TextOutputDev::getText(double xMin, double yMin,
2163 double xMax, double yMax) {
2164 return text->getText(xMin, yMin, xMax, yMax);
// Thin wrapper: forwards all arguments to TextPage::findCharRange and
// returns its result.
2167 GBool TextOutputDev::findCharRange(int pos, int length,
2168 double *xMin, double *yMin,
2169 double *xMax, double *yMax) {
2170 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);