1 //========================================================================
5 // Copyright 1997-2002 Glyph & Cog, LLC
7 //========================================================================
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
24 #include "GlobalParams.h"
25 #include "UnicodeMap.h"
27 #include "TextOutputDev.h"
30 // needed for setting type/creator of MacOS files
31 #include "ICSupport.h"
34 //------------------------------------------------------------------------
36 #define textOutSpace 0.2
37 #define textOutColSpace 0.2
39 //------------------------------------------------------------------------
41 struct TextOutColumnEdge {
45 //------------------------------------------------------------------------
47 //------------------------------------------------------------------------
57 TextString *strings; // list of strings in the block
58 TextBlock *next; // next block in line
59 TextBlock *xyNext; // next block on xyBlocks list
60 Unicode *text; // Unicode text of the block, including
61 // spaces between strings
62 double *xRight; // right-hand x coord of each char
63 int len; // total number of Unicode characters
64 int convertedLen; // total number of converted characters
65 int *col; // starting column number for each
69 TextBlock::TextBlock() {
78 TextBlock::~TextBlock() {
81 for (p1 = strings; p1; p1 = p2) {
90 //------------------------------------------------------------------------
92 //------------------------------------------------------------------------
105 TextLine::TextLine() {
110 TextLine::~TextLine() {
113 for (p1 = blocks; p1; p1 = p2) {
119 //------------------------------------------------------------------------
121 //------------------------------------------------------------------------
123 TextString::TextString(GfxState *state, double x0, double y0,
128 state->transform(x0, y0, &x, &y);
129 if ((font = state->getFont())) {
130 yMin = y - font->getAscent() * fontSize;
131 yMax = y - font->getDescent() * fontSize;
133 // this means that the PDF file draws text without a current font,
134 // which should never happen
135 yMin = y - 0.95 * fontSize;
136 yMax = y + 0.35 * fontSize;
139 // this is a sanity check for a case that shouldn't happen -- but
140 // if it does happen, we want to avoid dividing by zero later
152 TextString::~TextString() {
157 void TextString::addChar(GfxState *state, double x, double y,
158 double dx, double dy, Unicode u) {
161 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
162 xRight = (double *)grealloc(xRight, size * sizeof(double));
168 xMax = xRight[len] = x + dx;
172 //------------------------------------------------------------------------
174 //------------------------------------------------------------------------
176 TextPage::TextPage(GBool rawOrderA) {
177 rawOrder = rawOrderA;
181 xyCur1 = xyCur2 = NULL;
187 TextPage::~TextPage() {
191 void TextPage::updateFont(GfxState *state) {
195 int code, mCode, letterCode, anyCode;
198 // adjust the font size
199 fontSize = state->getTransformedFontSize();
200 if ((font = state->getFont()) && font->getType() == fontType3) {
201 // This is a hack which makes it possible to deal with some Type 3
202 // fonts. The problem is that it's impossible to know what the
203 // base coordinate system used in the font is without actually
204 // rendering the font. This code tries to guess by looking at the
205 // width of the character 'm' (which breaks if the font is a
206 // subset that doesn't contain 'm').
207 mCode = letterCode = anyCode = -1;
208 for (code = 0; code < 256; ++code) {
209 name = ((Gfx8BitFont *)font)->getCharName(code);
210 if (name && name[0] == 'm' && name[1] == '\0') {
213 if (letterCode < 0 && name && name[1] == '\0' &&
214 ((name[0] >= 'A' && name[0] <= 'Z') ||
215 (name[0] >= 'a' && name[0] <= 'z'))) {
218 if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) {
223 (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) {
224 // 0.6 is a generic average 'm' width -- yes, this is a hack
226 } else if (letterCode >= 0 &&
227 (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) {
228 // even more of a hack: 0.5 is a generic letter width
230 } else if (anyCode >= 0 &&
231 (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) {
232 // better than nothing: 0.5 is a generic character width
235 fm = font->getFontMatrix();
237 fontSize *= fabs(fm[3] / fm[0]);
242 void TextPage::beginString(GfxState *state, double x0, double y0) {
243 // This check is needed because Type 3 characters can contain
244 // text-drawing operations.
250 curStr = new TextString(state, x0, y0, fontSize);
253 void TextPage::addChar(GfxState *state, double x, double y,
254 double dx, double dy, Unicode *u, int uLen) {
255 double x1, y1, w1, h1, dx2, dy2;
258 state->transform(x, y, &x1, &y1);
259 if (x1 < 0 || x1 > state->getPageWidth() ||
260 y1 < 0 || y1 > state->getPageHeight()) {
263 state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
267 state->transformDelta(dx, dy, &w1, &h1);
268 if (!globalParams->getTextKeepTinyChars() &&
269 fabs(w1) < 3 && fabs(h1) < 3) {
270 if (++nTinyChars > 20000) {
275 if (n > 0 && x1 - curStr->xRight[n-1] >
276 0.1 * (curStr->yMax - curStr->yMin)) {
277 // large char spacing is sometimes used to move text around
279 beginString(state, x, y);
281 if (uLen == 1 && u[0] == (Unicode)0x20 &&
282 w1 > 0.5 * (curStr->yMax - curStr->yMin)) {
283 // large word spacing is sometimes used to move text around
290 for (i = 0; i < uLen; ++i) {
291 curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
295 void TextPage::endString() {
296 // This check is needed because Type 3 characters can contain
297 // text-drawing operations.
307 void TextPage::addString(TextString *str) {
310 // throw away zero-length strings -- they don't have valid xMin/xMax
311 // values, and they're useless anyway
317 // insert string in xy list
321 } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
322 (!xyCur2 || xyBefore(str, xyCur2))) {
325 } else if (xyCur1 && xyBefore(xyCur1, str)) {
326 for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) {
327 if (xyBefore(str, p2)) {
333 for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) {
334 if (xyBefore(str, p2)) {
349 void TextPage::coalesce() {
350 TextLine *line, *line0;
351 TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2;
352 TextString *str0, *str1, *str2, *str3, *str4;
353 TextString *str1prev, *str2prev, *str3prev;
354 TextOutColumnEdge *edges;
358 int edgesLength, edgesSize;
359 double x, yMin, yMax;
360 double space, fit1, fit2, h;
364 #if 0 //~ for debugging
365 for (str1 = xyStrings; str1; str1 = str1->next) {
366 printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
367 str1->xMin, str1->xMax, str1->yMin, str1->yMax,
368 (str1->yMax - str1->yMin));
369 for (i = 0; i < str1->len; ++i) {
370 fputc(str1->text[i] & 0xff, stdout);
374 printf("\n------------------------------------------------------------\n\n");
377 // build the list of column edges
379 edgesLength = edgesSize = 0;
381 for (str1prev = NULL, str1 = xyStrings;
383 str1prev = str1, str1 = str1->next) {
387 h = str1->yMax - str1->yMin;
388 if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) {
394 for (str2prev = str1, str2 = str1->next;
396 str2prev = str2, str2 = str2->next) {
397 h = str2->yMax - str2->yMin;
399 (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
400 fabs(str2->xMin - x) < 0.5 &&
401 str2->yMin - yMax < 0.3 * h &&
402 yMin - str2->yMax < 0.3 * h) {
407 if (str2->yMin < yMin) {
410 if (str2->yMax > yMax) {
413 str2->marked = gTrue;
414 for (str3prev = str1, str3 = str1->next;
416 str3prev = str3, str3 = str3->next) {
417 h = str3->yMax - str3->yMin;
419 (str3->xMin - str3prev->xMax) / h > textOutColSpace &&
420 fabs(str3->xMin - x) < 0.5 &&
421 str3->yMin - yMax < 0.3 * h &&
422 yMin - str3->yMax < 0.3 * h) {
427 if (str3->yMin < yMin) {
430 if (str3->yMax > yMax) {
433 str3->marked = gTrue;
435 for (str2prev = str1, str2 = str1->next;
437 str2prev = str2, str2 = str2->next) {
438 h = str2->yMax - str2->yMin;
440 (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
441 fabs(str2->xMin - x) < 0.5 &&
442 str2->yMin - yMax < 0.3 * h &&
443 yMin - str2->yMax < 0.3 * h) {
444 if (str2->yMin < yMin) {
447 if (str2->yMax > yMax) {
450 str2->marked = gTrue;
455 if (edgesLength == edgesSize) {
456 edgesSize = edgesSize ? 2 * edgesSize : 16;
457 edges = (TextOutColumnEdge *)
458 grealloc(edges, edgesSize * sizeof(TextOutColumnEdge));
460 edges[edgesLength].x = x;
461 edges[edgesLength].y0 = yMin;
462 edges[edgesLength].y1 = yMax;
465 str2->marked = gFalse;
468 str1->marked = gTrue;
472 #if 0 //~ for debugging
473 printf("column edges:\n");
474 for (i = 0; i < edgesLength; ++i) {
475 printf("%d: x=%.2f y0=%.2f y1=%.2f\n",
476 i, edges[i].x, edges[i].y0, edges[i].y1);
478 printf("\n------------------------------------------------------------\n\n");
488 xyStrings = xyStrings->next;
490 blk = new TextBlock();
492 blk->xMin = str0->xMin;
493 blk->xMax = str0->xMax;
494 blk->yMin = str0->yMin;
495 blk->yMax = str0->yMax;
499 fit1 = coalesceFit(str0, str2);
501 // look for best-fitting string
502 space = str0->yMax - str0->yMin;
503 for (str3 = xyStrings, str4 = xyStrings->next;
504 str4 && str4->xMin - str0->xMax <= space;
505 str3 = str4, str4 = str4->next) {
506 fit2 = coalesceFit(str0, str4);
515 // no fit - we're done with this block
519 // if we've hit a column edge we're done with this block
521 for (i = 0; i < edgesLength; ++i) {
522 if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin &&
523 str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 &&
524 str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) {
528 if (i < edgesLength) {
534 str1->next = str2->next;
536 xyStrings = str2->next;
540 if (str2->xMax > blk->xMax) {
541 blk->xMax = str2->xMax;
543 if (str2->yMin < blk->yMin) {
544 blk->yMin = str2->yMin;
546 if (str2->yMax > blk->yMax) {
547 blk->yMax = str2->yMax;
552 // insert block on list
554 // insert block on list in yx order
555 for (blk1 = NULL, blk2 = yxBlocks;
556 blk2 && !yxBefore(blk, blk2);
557 blk1 = blk2, blk2 = blk2->next) ;
570 // the strings are now owned by the lines/blocks tree
573 // build the block text
574 uMap = globalParams->getTextEncoding();
575 isUnicode = uMap ? uMap->isUnicode() : gFalse;
576 for (blk = yxBlocks; blk; blk = blk->next) {
578 for (str1 = blk->strings; str1; str1 = str1->next) {
579 blk->len += str1->len;
580 if (str1->next && str1->next->xMin - str1->xMax >
581 textOutSpace * (str1->yMax - str1->yMin)) {
582 str1->spaceAfter = gTrue;
585 str1->spaceAfter = gFalse;
588 blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode));
589 blk->xRight = (double *)gmalloc(blk->len * sizeof(double));
590 blk->col = (int *)gmalloc(blk->len * sizeof(int));
592 for (str1 = blk->strings; str1; str1 = str1->next) {
593 for (j = 0; j < str1->len; ++j) {
594 blk->text[i] = str1->text[j];
595 blk->xRight[i] = str1->xRight[j];
598 if (str1->spaceAfter) {
599 blk->text[i] = (Unicode)0x0020;
600 blk->xRight[i] = str1->next->xMin;
604 blk->convertedLen = 0;
605 for (j = 0; j < blk->len; ++j) {
606 blk->col[j] = blk->convertedLen;
610 blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf));
618 #if 0 //~ for debugging
619 for (blk = yxBlocks; blk; blk = blk->next) {
620 printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
621 blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
623 for (str = blk->strings; str; str = str->next) {
624 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'",
625 str->xMin, str->xMax, str->yMin, str->yMax,
626 (str->yMax - str->yMin));
627 for (i = 0; i < str->len; ++i) {
628 fputc(str->text[i] & 0xff, stdout);
630 if (str->spaceAfter) {
636 printf("\n------------------------------------------------------------\n\n");
644 yxBlocks = yxBlocks->next;
646 line = new TextLine();
648 line->yMin = blk0->yMin;
649 line->yMax = blk0->yMax;
652 // remove duplicated text (fake boldface, shadowed text)
653 h = blk0->yMax - blk0->yMin;
654 if (yxBlocks->len == blk0->len &&
655 !memcmp(yxBlocks->text, blk0->text,
656 yxBlocks->len * sizeof(Unicode)) &&
657 fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 &&
658 fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 &&
659 fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 &&
660 fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) {
662 yxBlocks = yxBlocks->next;
667 if (rawOrder && yxBlocks->yMax < blk0->yMin) {
670 if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax ||
671 yxBlocks->xMin < blk0->xMax) {
675 yxBlocks = yxBlocks->next;
678 if (blk1->yMin < line->yMin) {
679 line->yMin = blk1->yMin;
681 if (blk1->yMax > line->yMax) {
682 line->yMax = blk1->yMax;
696 // sort the blocks into xy order
698 for (line = lines; line; line = line->next) {
699 for (blk = line->blocks; blk; blk = blk->next) {
700 for (blk1 = NULL, blk2 = xyBlocks;
701 blk2 && !xyBefore(blk, blk2);
702 blk1 = blk2, blk2 = blk2->xyNext) ;
712 #if 0 //~ for debugging
713 for (blk = xyBlocks; blk; blk = blk->xyNext) {
714 printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
715 blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
717 for (str = blk->strings; str; str = str->next) {
718 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
719 str->xMin, str->xMax, str->yMin, str->yMax,
720 (str->yMax - str->yMin));
721 for (i = 0; i < str->len; ++i) {
722 fputc(str->text[i] & 0xff, stdout);
727 printf("\n------------------------------------------------------------\n\n");
730 // do column assignment
731 for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) {
733 for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) {
734 if (blk1->xMin >= blk2->xMax) {
735 d = (int)((blk1->xMin - blk2->xMax) /
736 (0.4 * (blk1->yMax - blk1->yMin)));
740 col2 = blk2->col[0] + blk2->convertedLen + d;
744 } else if (blk1->xMin > blk2->xMin) {
745 for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ;
752 for (j = 0; j < blk1->len; ++j) {
753 blk1->col[j] += col1;
757 #if 0 //~ for debugging
758 for (line = lines; line; line = line->next) {
760 for (blk = line->blocks; blk; blk = blk->next) {
761 printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len);
763 for (str = blk->strings; str; str = str->next) {
764 printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
765 str->xMin, str->xMax, str->yMin, str->yMax,
766 (str->yMax - str->yMin));
767 for (i = 0; i < str->len; ++i) {
768 fputc(str->text[i] & 0xff, stdout);
770 if (str->spaceAfter) {
771 printf(" [space]\n");
777 printf("\n------------------------------------------------------------\n\n");
782 GBool TextPage::findText(Unicode *s, int len,
783 GBool top, GBool bottom,
784 double *xMin, double *yMin,
785 double *xMax, double *yMax) {
793 // scan all blocks on page
794 for (line = lines; line; line = line->next) {
795 for (blk = line->blocks; blk; blk = blk->next) {
797 // check: above top limit?
798 if (!top && (blk->yMax < *yMin ||
799 (blk->yMin < *yMin && blk->xMax <= *xMin))) {
803 // check: below bottom limit?
804 if (!bottom && (blk->yMin > *yMax ||
805 (blk->yMax > *yMax && blk->xMin >= *xMax))) {
809 // search each position in this block
811 for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {
813 x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
817 // check: above top limit?
818 if (!top && blk->yMin < *yMin) {
824 // check: below bottom limit?
825 if (!bottom && blk->yMax > *yMax) {
831 // compare the strings
832 for (j = 0; j < len; ++j) {
833 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
834 //~ extended to handle other character sets
835 if (p[j] >= 0x41 && p[j] <= 0x5a) {
840 if (s[j] >= 0x41 && s[j] <= 0x5a) {
854 *xMax = blk->xRight[i + len - 1];
866 GString *TextPage::getText(double xMin, double yMin,
867 double xMax, double yMax) {
871 char space[8], eol[16], buf[8];
872 int spaceLen, eolLen, len;
876 int firstCol, col, i;
881 // get the output encoding
882 if (!(uMap = globalParams->getTextEncoding())) {
885 isUnicode = uMap->isUnicode();
886 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
887 eolLen = 0; // make gcc happy
888 switch (globalParams->getTextEOL()) {
890 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
893 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
894 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
897 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
901 // find the leftmost column
904 for (line = lines; line; line = line->next) {
905 if (line->yMin > yMax) {
908 if (line->yMax < yMin) {
912 for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
913 if (!blk || blk->xMin > xMax) {
917 y = 0.5 * (blk->yMin + blk->yMax);
918 if (y < yMin || y > yMax) {
928 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
930 if (0.5 * (x0 + x1) > xMin) {
937 if (firstCol < 0 || col < firstCol) {
943 for (line = lines; line; line = line->next) {
944 if (line->yMin > yMax) {
947 if (line->yMax < yMin) {
951 for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
952 if (!blk || blk->xMin > xMax) {
956 y = 0.5 * (blk->yMin + blk->yMax);
957 if (y < yMin || y > yMax) {
963 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
965 if (0.5 * (x0 + x1) > xMin) {
975 // line this block up with the correct column
976 for (; col < blk->col[i]; ++col) {
977 s->append(space, spaceLen);
981 for (; i < blk->len; ++i) {
983 x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
985 if (0.5 * (x0 + x1) > xMax) {
989 len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
991 col += isUnicode ? 1 : len;
1001 } while (blk && blk->xMin < xMax);
1004 s->append(eol, eolLen);
1013 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
1015 char space[8], eol[16], eop[8], buf[8];
1016 int spaceLen, eolLen, eopLen, len;
1021 // get the output encoding
1022 if (!(uMap = globalParams->getTextEncoding())) {
1025 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1026 eolLen = 0; // make gcc happy
1027 switch (globalParams->getTextEOL()) {
1029 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
1032 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1033 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
1036 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1039 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
1042 for (line = lines; line; line = line->next) {
1044 for (blk = line->blocks; blk; blk = blk->next) {
1046 // line this block up with the correct column
1047 if (rawOrder && col == 0) {
1050 for (; col < blk->col[0]; ++col) {
1051 (*outputFunc)(outputStream, space, spaceLen);
1056 for (i = 0; i < blk->len; ++i) {
1057 len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
1058 (*outputFunc)(outputStream, buf, len);
1060 col += blk->convertedLen;
1064 (*outputFunc)(outputStream, eol, eolLen);
1066 // print extra vertical space if necessary
1068 d = (int)((line->next->yMin - line->yMax) /
1069 (line->blocks->strings->yMax - lines->blocks->strings->yMin)
1071 // various things (weird font matrices) can result in bogus
1072 // values here, so do a sanity check
1073 if (rawOrder && d > 2) {
1075 } else if (!rawOrder && d > 5) {
1078 for (; d > 0; --d) {
1079 (*outputFunc)(outputStream, eol, eolLen);
1085 (*outputFunc)(outputStream, eol, eolLen);
1086 (*outputFunc)(outputStream, eop, eopLen);
1087 (*outputFunc)(outputStream, eol, eolLen);
1092 // Returns true if <str1> should be inserted before <str2> in xy
1094 GBool TextPage::xyBefore(TextString *str1, TextString *str2) {
1095 return str1->xMin < str2->xMin ||
1096 (str1->xMin == str2->xMin && str1->yMin < str2->yMin);
1099 // Returns true if <blk1> should be inserted before <blk2> in xy
1101 GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) {
1102 return blk1->xMin < blk2->xMin ||
1103 (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin);
1106 // Returns true if <blk1> should be inserted before <blk2> in yx
1107 // order, allowing a little slack for vertically overlapping text.
1108 GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) {
1109 double h1, h2, overlap;
1111 h1 = blk1->yMax - blk1->yMin;
1112 h2 = blk2->yMax - blk2->yMin;
1113 overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) -
1114 (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) /
1115 (h1 < h2 ? h1 : h2);
1116 if (overlap > 0.6) {
1117 return blk1->xMin < blk2->xMin;
1119 return blk1->yMin < blk2->yMin;
1122 double TextPage::coalesceFit(TextString *str1, TextString *str2) {
1123 double h1, h2, w1, w2, r, overlap, spacing;
1125 h1 = str1->yMax - str1->yMin;
1126 h2 = str2->yMax - str2->yMin;
1127 w1 = str1->xMax - str1->xMin;
1128 w2 = str2->xMax - str2->xMin;
1130 if (r < (1.0 / 3.0) || r > 3) {
1133 overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) -
1134 (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) /
1135 (h1 < h2 ? h1 : h2);
1136 if (overlap < 0.5) {
1139 spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2);
1140 if (spacing < -0.5) {
1143 // separate text that overlaps - duplicated text (so that fake
1144 // boldface and shadowed text can be cleanly removed)
1145 if ((str2->xMin - str1->xMax) / (w1 < w2 ? w1 : w2) < -0.7) {
1151 void TextPage::clear() {
1153 TextString *s1, *s2;
1160 for (p1 = lines; p1; p1 = p2) {
1164 } else if (xyStrings) {
1165 for (s1 = xyStrings; s1; s1 = s2) {
1171 xyCur1 = xyCur2 = NULL;
1177 //------------------------------------------------------------------------
1179 //------------------------------------------------------------------------
// Text output callback used when writing to a stdio stream: <stream> is
// a FILE * passed as void *, and <len> bytes of <text> are written to it.
// Calls with a null stream, null text, or a non-positive length are
// ignored (a negative length would otherwise be converted to a huge
// size_t by fwrite).
static void outputToFile(void *stream, char *text, int len) {
  if (!stream || !text || len <= 0) {
    return;
  }
  fwrite(text, 1, (size_t)len, (FILE *)stream);
}
1185 TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
1187 rawOrder = rawOrderA;
1193 if (!strcmp(fileName, "-")) {
1194 outputStream = stdout;
1195 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
1198 error(-1, "Couldn't open text file '%s'", fileName);
1202 outputFunc = &outputToFile;
1204 outputStream = NULL;
1207 // set up text object
1208 text = new TextPage(rawOrder);
1211 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
1214 outputStream = stream;
1216 rawOrder = rawOrderA;
1217 text = new TextPage(rawOrder);
1221 TextOutputDev::~TextOutputDev() {
1224 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
1226 fclose((FILE *)outputStream);
1233 void TextOutputDev::startPage(int pageNum, GfxState *state) {
1237 void TextOutputDev::endPage() {
1240 text->dump(outputStream, outputFunc);
1244 void TextOutputDev::updateFont(GfxState *state) {
1245 text->updateFont(state);
1248 void TextOutputDev::beginString(GfxState *state, GString *s) {
1249 text->beginString(state, state->getCurX(), state->getCurY());
1252 void TextOutputDev::endString(GfxState *state) {
1256 void TextOutputDev::drawChar(GfxState *state, double x, double y,
1257 double dx, double dy,
1258 double originX, double originY,
1259 CharCode c, Unicode *u, int uLen) {
1260 text->addChar(state, x, y, dx, dy, u, uLen);
1263 GBool TextOutputDev::findText(Unicode *s, int len,
1264 GBool top, GBool bottom,
1265 double *xMin, double *yMin,
1266 double *xMax, double *yMax) {
1267 return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
1270 GString *TextOutputDev::getText(double xMin, double yMin,
1271 double xMax, double yMax) {
1272 return text->getText(xMin, yMin, xMax, yMax);