1 //========================================================================
5 // Copyright 1997 Derek B. Noonburg
7 //========================================================================
10 #pragma implementation
22 #include "FontEncoding.h"
23 #include "TextOutputDev.h"
25 #include "TextOutputFontInfo.h"
27 //------------------------------------------------------------------------
28 // Character substitutions
29 //------------------------------------------------------------------------
// Substitution strings used when producing ISO Latin-1 output.
// TextString::addChar selects an entry with the index (c1 - 256); the
// trailing comment on each row names the glyph(s) the entries stand for.
31 static char *isoLatin1Subst[] = {
37 "fi", "fl", // ligatures
38 "ff", "ffi", "ffl", // ligatures
46 "-", "-", // emdash, hyphen
47 "\"", "\"", // quotedblleft, quotedblright
// Substitution strings used when producing 7-bit ASCII output.
// TextString::addChar selects an entry with the index (c1 - 128).  Each
// accented letter is replaced by its unaccented base letter, ligatures
// are spelled out, and dashes / double quotes become '-' and '"'.  The
// trailing comment on each row names the glyph(s) the entries stand for.
52 static char *ascii7Subst[] = {
53 "A", "A", "A", "A", // A{acute,circumflex,dieresis,grave}
54 "A", "A", // A{ring,tilde}
57 "E", "E", "E", "E", // E{acute,circumflex,dieresis,grave}
58 "I", "I", "I", "I", // I{acute,circumflex,dieresis,grave}
61 "O", "O", "O", "O", // O{acute,circumflex,dieresis,grave}
62 "O", "O", // O{slash,tilde}
65 "U", "U", "U", "U", // U{acute,circumflex,dieresis,grave}
66 "Y", "Y", // Y{acute,dieresis}
68 "a", "a", "a", "a", // a{acute,circumflex,dieresis,grave}
69 "a", "a", // a{ring,tilde}
72 "e", "e", "e", "e", // e{acute,circumflex,dieresis,grave}
73 "fi", "fl", // ligatures
74 "ff", "ffi", "ffl", // ligatures
76 "i", "i", "i", "i", // i{acute,circumflex,dieresis,grave}
79 "o", "o", "o", "o", // o{acute,circumflex,dieresis,grave}
80 "o", "o", // o{slash,tilde}
83 "u", "u", "u", "u", // u{acute,circumflex,dieresis,grave}
84 "y", "y", // y{acute,dieresis}
89 "-", "-", "-", // emdash, endash, hyphen
90 "\"", "\"", // quotedblleft, quotedblright
96 //------------------------------------------------------------------------
98 //------------------------------------------------------------------------
// Lookup table for the Adobe-Japan1-2 conversion in TextString::addChar16.
// It is indexed there with c and with (c - 230), and the selected entry
// is added to 0x8080 to form the converted 16-bit code.  The trailing
// comment on each row gives the (hexadecimal) index range of that row.
103 static Gushort japan12Map[96] = {
104 0x2120, 0x2120, 0x212a, 0x2149, 0x2174, 0x2170, 0x2173, 0x2175, // 00 .. 07
105 0x2147, 0x214a, 0x214b, 0x2176, 0x215c, 0x2124, 0x213e, 0x2123, // 08 .. 0f
106 0x213f, 0x2330, 0x2331, 0x2332, 0x2333, 0x2334, 0x2335, 0x2336, // 10 .. 17
107 0x2337, 0x2338, 0x2339, 0x2127, 0x2128, 0x2163, 0x2161, 0x2164, // 18 .. 1f
108 0x2129, 0x2177, 0x2341, 0x2342, 0x2343, 0x2344, 0x2345, 0x2346, // 20 .. 27
109 0x2347, 0x2348, 0x2349, 0x234a, 0x234b, 0x234c, 0x234d, 0x234e, // 28 .. 2f
110 0x234f, 0x2350, 0x2351, 0x2352, 0x2353, 0x2354, 0x2355, 0x2356, // 30 .. 37
111 0x2357, 0x2358, 0x2359, 0x235a, 0x214e, 0x216f, 0x214f, 0x2130, // 38 .. 3f
112 0x2132, 0x2146, 0x2361, 0x2362, 0x2363, 0x2364, 0x2365, 0x2366, // 40 .. 47
113 0x2367, 0x2368, 0x2369, 0x236a, 0x236b, 0x236c, 0x236d, 0x236e, // 48 .. 4f
114 0x236f, 0x2370, 0x2371, 0x2372, 0x2373, 0x2374, 0x2375, 0x2376, // 50 .. 57
115 0x2377, 0x2378, 0x2379, 0x237a, 0x2150, 0x2143, 0x2151, 0x2141 // 58 .. 5f
// Lookup table for the Adobe-Japan1-2 conversion in TextString::addChar16.
// It is indexed there with (c - 325), and the selected entry is added to
// 0x8080 to form the converted 16-bit code.
119 static Gushort japan12KanaMap1[97] = {
120 0x2131, 0x2121, 0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572,
121 0x2521, 0x2523, 0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567,
122 0x2543, 0x213c, 0x2522, 0x2524, 0x2526, 0x2528, 0x252a, 0x252b,
123 0x252d, 0x252f, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253b,
124 0x253d, 0x253f, 0x2541, 0x2544, 0x2546, 0x2548, 0x254a, 0x254b,
125 0x254c, 0x254d, 0x254e, 0x254f, 0x2552, 0x2555, 0x2558, 0x255b,
126 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568,
127 0x2569, 0x256a, 0x256b, 0x256c, 0x256d, 0x256f, 0x2573, 0x212b,
128 0x212c, 0x212e, 0x2570, 0x2571, 0x256e, 0x2575, 0x2576, 0x2574,
129 0x252c, 0x252e, 0x2530, 0x2532, 0x2534, 0x2536, 0x2538, 0x253a,
130 0x253c, 0x253e, 0x2540, 0x2542, 0x2545, 0x2547, 0x2549, 0x2550,
131 0x2551, 0x2553, 0x2554, 0x2556, 0x2557, 0x2559, 0x255a, 0x255c,
// Lookup table for the Adobe-Japan1-2 conversion in TextString::addChar16.
// It is indexed there with (c - 501), and the selected entry is added to
// 0x8080 to form the converted 16-bit code.
136 static Gushort japan12KanaMap2[98] = {
137 0x212d, 0x212f, 0x216d, 0x214c, 0x214d, 0x2152, 0x2153, 0x2154,
138 0x2155, 0x2158, 0x2159, 0x215a, 0x215b, 0x213d, 0x2121, 0x2472,
139 0x2421, 0x2423, 0x2425, 0x2427, 0x2429, 0x2463, 0x2465, 0x2467,
140 0x2443, 0x2422, 0x2424, 0x2426, 0x2428, 0x242a, 0x242b, 0x242d,
141 0x242f, 0x2431, 0x2433, 0x2435, 0x2437, 0x2439, 0x243b, 0x243d,
142 0x243f, 0x2441, 0x2444, 0x2446, 0x2448, 0x244a, 0x244b, 0x244c,
143 0x244d, 0x244e, 0x244f, 0x2452, 0x2455, 0x2458, 0x245b, 0x245e,
144 0x245f, 0x2460, 0x2461, 0x2462, 0x2464, 0x2466, 0x2468, 0x2469,
145 0x246a, 0x246b, 0x246c, 0x246d, 0x246f, 0x2473, 0x2470, 0x2471,
146 0x246e, 0x242c, 0x242e, 0x2430, 0x2432, 0x2434, 0x2436, 0x2438,
147 0x243a, 0x243c, 0x243e, 0x2440, 0x2442, 0x2445, 0x2447, 0x2449,
148 0x2450, 0x2451, 0x2453, 0x2454, 0x2456, 0x2457, 0x2459, 0x245a,
// ASCII spellings of the Roman numerals I..X.  TextString::addChar16
// indexes this table with (c - 7575) and copies the chosen string into
// its substitution buffer.
152 static char *japan12Roman[10] = {
153 "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
// ASCII spellings of six unit abbreviations.  TextString::addChar16
// indexes this table with (c - 7601) and copies the chosen string into
// its substitution buffer.
156 static char *japan12Abbrev1[6] = {
157 "mm", "cm", "km", "mg", "kg", "cc"
162 //------------------------------------------------------------------------
164 //------------------------------------------------------------------------
// Create a text string positioned at the current point of <state>.
// <hexCodes1> is stored in the hexCodes member, which addChar consults
// when it has to interpret a glyph name as a number.
166 TextString::TextString(GfxState *state, GBool hexCodes1) {
// transform the current point (getCurX / getCurY) into x, y
169 state->transform(state->getCurX(), state->getCurY(), &x, &y);
// h is the font size after transformation
170 h = state->getTransformedFontSize();
171 //~ yMin/yMax computation should use font ascent/descent values
// the string's vertical extent is taken to be 1.3 times h
173 yMax = yMin + 1.3 * h;
// buffer that addChar / addChar16 append the characters to
175 text = new GString();
179 hexCodes = hexCodes1;
// Destructor for TextString.
182 TextString::~TextString() {
// Append the 8-bit character <c> to this string.
// The character is looked up through the glyph name reported by the
// current font (getCharName) and translated with ascii7Encoding or
// isoLatin1Encoding; glyph names that merely encode a number are parsed
// directly.  <x> and <dx> are the character's starting x position and
// advance; they are used to fill in the xRight array.
// NOTE(review): the choice between the ASCII7 and ISO Latin-1 paths
// presumably follows <useASCII7> -- confirm.
187 void TextString::addChar(GfxState *state, double x, double y,
188 double dx, double dy,
189 Guchar c, GBool useASCII7) {
190 char *charName, *sub;
// i = index at which the new character(s) will be stored
195 i = text->getLength();
197 // append translated character(s) to string
200 if ((charName = state->getFont()->getCharName(c))) {
202 c1 = ascii7Encoding.getCharCode(charName);
204 c1 = isoLatin1Encoding.getCharCode(charName);
206 m = strlen(charName);
// with hexCodes: a 3-character name made of 'B', 'C' or 'G' followed by
// two hex digits is read as a hexadecimal character code
207 if (hexCodes && m == 3 &&
208 (charName[0] == 'B' || charName[0] == 'C' ||
209 charName[0] == 'G') &&
210 isxdigit(charName[1]) && isxdigit(charName[2])) {
211 sscanf(charName+1, "%x", &c1);
// without hexCodes: a 2- or 3-character name starting with two decimal
// digits
212 } else if (!hexCodes && m >= 2 && m <= 3 &&
213 isdigit(charName[0]) && isdigit(charName[1])) {
// without hexCodes: a 3- to 5-character name whose second character is
// a digit; the decimal number after the first character is the code
217 } else if (!hexCodes && m >= 3 && m <= 5 && isdigit(charName[1])) {
218 c1 = atoi(charName+1);
222 //~ this is a kludge -- is there a standard internal encoding
223 //~ used by all/most Type 1 fonts?
224 if (c1 == 262) // hyphen
226 else if (c1 == 266) // emdash
// re-map c1 from an ISO Latin-1 code to the ASCII7 code that has the
// same glyph name
229 c1 = ascii7Encoding.getCharCode(isoLatin1Encoding.getCharName(c1));
// pick the substitution string for this code from the ASCII7 table ...
233 sub = ascii7Subst[c1 - 128];
// ... or from the ISO Latin-1 table
238 sub = isoLatin1Subst[c1 - 256];
248 text->append((char)c1);
252 // update position information
// xRight is sized in multiples of 16 entries: reallocate when i+n goes
// past i rounded up to the next multiple of 16
253 if (i+n > ((i+15) & ~15))
254 xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double));
// the n output characters share the advance dx evenly
257 for (j = 0; j < n; ++j)
258 xRight[i+j] = x + ((j+1) * dx) / n;
// Append the 16-bit character <c> of character set <charSet> to this
// string.  For font16AdobeJapan12 the code is converted range by range
// (see the comment on the case label); the result is either a single
// code c1 or a short sequence of codes in sub, and each code is appended
// to the text as two bytes, high byte first.  <x> and <dx> are used to
// fill in the xRight array.
262 void TextString::addChar16(GfxState *state, double x, double y,
263 double dx, double dy,
264 int c, GfxFontCharSet16 charSet) {
// i = index at which the new bytes will be stored
272 i = text->getLength();
274 // convert the 16-bit character
279 // convert Adobe-Japan1-2 to JIS X 0208-1983
280 case font16AdobeJapan12:
// table-driven ranges: the table entry is offset by 0x8080
283 c1 = 0x8080 + japan12Map[c];
284 } else if (c <= 632) {
288 c1 = 0x8080 + japan12Map[c - 230];
290 c1 = 0x8080 + japan12KanaMap1[c - 325];
294 c1 = 0x8080 + japan12KanaMap2[c - 501];
// contiguous ranges: a fixed base code plus the offset of c within the
// range
297 } else if (c <= 1124) {
300 c1 = 0xa1a1 + (c - 633);
302 c1 = 0xa2a1 + (c - 727);
304 c1 = 0xa2ba + (c - 741);
306 c1 = 0xa2ca + (c - 749);
308 c1 = 0xa2dc + (c - 756);
310 c1 = 0xa2f2 + (c - 771);
313 } else if (c <= 841) {
315 c1 = 0xa3b0 + (c - 780);
317 c1 = 0xa3c1 + (c - 790);
319 c1 = 0xa3e1 + (c - 816);
320 } else if (c <= 1010) {
322 c1 = 0xa4a1 + (c - 842);
324 c1 = 0xa5a1 + (c - 925);
327 c1 = 0xa6a1 + (c - 1011);
329 c1 = 0xa6c1 + (c - 1035);
331 c1 = 0xa7a1 + (c - 1059);
333 c1 = 0xa7d1 + (c - 1092);
// rows of 94 codes starting at 0xb0a1: t1 selects the row (high byte),
// t2 the position within the row (low byte)
335 } else if (c <= 4089) {
336 t1 = (c - 1125) / 94;
337 t2 = (c - 1125) % 94;
338 c1 = 0xb0a1 + (t1 << 8) + t2;
// same layout, rows starting at 0xd0a1
339 } else if (c <= 7477) {
340 t1 = (c - 4090) / 94;
341 t2 = (c - 4090) % 94;
342 c1 = 0xd0a1 + (t1 << 8) + t2;
343 } else if (c <= 7554) {
345 } else if (c <= 7563) { // circled Arabic numbers 1..9
346 c1 = 0xa3b1 + (c - 7555);
347 } else if (c <= 7574) { // circled Arabic numbers 10..20
// two output codes: tens digit and units digit, each 0xa3b0 + digit
349 sub[0] = 0xa3b0 + (t1 / 10);
350 sub[1] = 0xa3b0 + (t1 % 10);
353 } else if (c <= 7584) { // Roman numbers I..X
// walk the ASCII spelling p while filling the substitution buffer q
354 for (p = japan12Roman[c - 7575], q = sub; *p; ++p, ++q) {
359 } else if (c <= 7632) {
362 } else if (c <= 7606) {
363 for (p = japan12Abbrev1[c - 7601], q = sub; *p; ++p, ++q) {
374 #endif // JAPANESE_SUPPORT
378 // append converted character to string
381 error(-1, "Unsupported Adobe-Japan1-2 character: %d", c);
// single converted code: high byte, then low byte
386 text->append(c1 >> 8);
387 text->append(c1 & 0xff);
// substitution sequence: every element of sub (up to the terminating
// zero) is appended as high byte, then low byte
391 for (q = sub; *q; ++q) {
392 text->append(*q >> 8);
393 text->append(*q & 0xff);
398 // update position information
// xRight is sized in multiples of 16 entries: reallocate when i+n goes
// past i rounded up to the next multiple of 16
399 if (i+n > ((i+15) & ~15)) {
400 xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double));
// every new entry gets the same right edge, x + dx
405 for (j = 0; j < n; ++j) {
406 xRight[i+j] = x + dx;
411 //------------------------------------------------------------------------
413 //------------------------------------------------------------------------
// Create a text page.  <useASCII7> and <rawOrder> are stored in the
// members of the same name; useASCII7 is later passed to
// TextString::addChar, and rawOrder is consulted by coalesce and dump.
415 TextPage::TextPage(GBool useASCII7, GBool rawOrder) {
416 this->useASCII7 = useASCII7;
417 this->rawOrder = rawOrder;
// yxCur1 / yxCur2 are the neighbours endString checks first when it
// inserts a string into the y-major list; none are known yet
421 yxCur1 = yxCur2 = NULL;
// Destructor for TextPage.
424 TextPage::~TextPage() {
// Begin a new string: curStr is set to a freshly constructed TextString
// built from <state> and <hexCodes>.
428 void TextPage::beginString(GfxState *state, GString *s, GBool hexCodes) {
429 curStr = new TextString(state, hexCodes);
// Add the 8-bit character <c> to the current string.  The position
// (<x>, <y>) and the advance (<dx>, <dy>) are transformed through <state>
// before being handed to TextString::addChar.  When the character starts
// too far to the right of the text collected so far, a new string is
// begun first.
432 void TextPage::addChar(GfxState *state, double x, double y,
433 double dx, double dy, Guchar c) {
434 double x1, y1, w1, h1, dx2, dy2;
438 state->transform(x, y, &x1, &y1);
// the character spacing, expressed as a transformed delta (dx2, dy2)
439 state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2);
442 state->transformDelta(dx, dy, &w1, &h1);
443 n = curStr->text->getLength();
// gap between this character and the right edge of the last stored
// character exceeds 10% of the current string's height
445 x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
// carry the hexCodes setting over to the replacement string
446 hexCodes = curStr->hexCodes;
448 beginString(state, NULL, hexCodes);
450 curStr->addChar(state, x1, y1, w1, h1, c, useASCII7);
// Add the 16-bit character <c> of character set <charSet> to the current
// string.  The position (<x>, <y>) and the advance (<dx>, <dy>) are
// transformed through <state> before being handed to
// TextString::addChar16.  When the character starts too far to the right
// of the text collected so far, a new string is begun first.
453 void TextPage::addChar16(GfxState *state, double x, double y,
454 double dx, double dy, int c,
455 GfxFontCharSet16 charSet) {
456 double x1, y1, w1, h1, dx2, dy2;
460 state->transform(x, y, &x1, &y1);
// the character spacing, expressed as a transformed delta (dx2, dy2)
461 state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2);
464 state->transformDelta(dx, dy, &w1, &h1);
465 n = curStr->text->getLength();
// gap between this character and the right edge of the last stored
// entry exceeds 10% of the current string's height
467 x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
// carry the hexCodes setting over to the replacement string
468 hexCodes = curStr->hexCodes;
470 beginString(state, NULL, hexCodes);
472 curStr->addChar16(state, x1, y1, w1, h1, c, charSet);
// Finish the current string (curStr) and file it in the y-major list
// (yxStrings).  Empty strings are discarded, as explained below.
475 void TextPage::endString() {
479 // throw away zero-length strings -- they don't have valid xMin/xMax
480 // values, and they're useless anyway
481 if (curStr->text->getLength() == 0) {
// strings more than 20 units tall are singled out here
488 if (curStr->yMax - curStr->yMin > 20) {
495 // insert string in y-major list
// y1 and y2 lie 50% and 80% of the way from yMin to yMax
496 h = curStr->yMax - curStr->yMin;
497 y1 = curStr->yMin + 0.5 * h;
498 y2 = curStr->yMin + 0.8 * h;
// first try the remembered neighbours: the string must not sort before
// yxCur1 and must sort before yxCur2
502 } else if ((!yxCur1 ||
503 (y1 >= yxCur1->yMin &&
504 (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) &&
506 (y1 < yxCur2->yMin ||
507 (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
// otherwise walk the whole list; p1 trails one element behind p2, and
// the test below is the same "sorts before" test applied to p2
511 for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
512 if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin))
// Merge neighbouring strings of the y-major list.  For each string str1
// and its successor str2, the vertical relation (which depends on
// rawOrder) and the horizontal gap d are tested; when the strings are
// merged, str2's text, xRight entries and bounds are folded into str1
// and str2 is unlinked from the list.
526 void TextPage::coalesce() {
527 TextString *str1, *str2;
531 #if 0 //~ for debugging
532 for (str1 = yxStrings; str1; str1 = str1->yxNext) {
533 printf("x=%3d..%3d y=%3d..%3d size=%2d '%s'\n",
534 (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
535 (int)(str1->yMax - str1->yMin), str1->text->getCString());
537 printf("\n------------------------------------------------------------\n\n");
540 while (str1 && (str2 = str1->yxNext)) {
// space = height of str1, used as the unit for the gap thresholds;
// d = horizontal gap from the end of str1 to the start of str2
541 space = str1->yMax - str1->yMin;
542 d = str2->xMin - str1->xMax;
// narrow gap: -0.1 .. 0.2 times the height of str1
545 ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
546 (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
547 (!rawOrder && str2->yMin < str1->yMax)) &&
548 d > -0.1 * space && d < 0.2 * space) {
// wider gap: -0.5 .. 1.0 times the height of str1
551 ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
552 (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
553 (!rawOrder && str2->yMin < str1->yMax)) &&
554 d > -0.5 * space && d < space) {
// n = length of str1 before the merge = first free xRight index
556 n = str1->text->getLength();
558 str1->text->append(' ');
559 str1->text->append(str2->text);
// resize xRight to one entry per character of the merged text
560 str1->xRight = (double *)
561 grealloc(str1->xRight, str1->text->getLength() * sizeof(double));
// the right edge recorded for the separating space is str2's left edge
563 str1->xRight[n++] = str2->xMin;
564 for (i = 0; i < str2->text->getLength(); ++i)
565 str1->xRight[n++] = str2->xRight[i];
// widen str1's bounds to include str2
566 if (str2->xMax > str1->xMax)
567 str1->xMax = str2->xMax;
568 if (str2->yMax > str1->yMax)
569 str1->yMax = str2->yMax;
// unlink str2 from the y-major list
570 str1->yxNext = str2->yxNext;
// Search the strings of the page for <s>, comparing characters
// case-insensitively (tolower on both sides).  When <top> is false the
// search is limited by *yMin / *xMin, and when <bottom> is false by
// *yMax / *xMax.  On a match, *xMin and *xMax are set to the horizontal
// extent of the matched characters.
// NOTE(review): n is presumably the length of <s> -- confirm.
578 GBool TextPage::findText(char *s, GBool top, GBool bottom,
579 double *xMin, double *yMin,
580 double *xMax, double *yMax) {
586 // scan all strings on page
588 for (str = yxStrings; str; str = str->yxNext) {
590 // check: above top limit?
591 if (!top && (str->yMax < *yMin ||
592 (str->yMin < *yMin && str->xMax <= *xMin)))
595 // check: below bottom limit?
596 if (!bottom && (str->yMin > *yMax ||
597 (str->yMax > *yMax && str->xMin >= *xMax)))
600 // search each position in this string
// i runs over every start position that leaves n characters available
601 m = str->text->getLength();
602 for (i = 0, p = str->text->getCString(); i <= m - n; ++i, ++p) {
604 // check: above top limit?
605 if (!top && str->yMin < *yMin) {
// x = horizontal midpoint of character i
606 x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
611 // check: below bottom limit?
612 if (!bottom && str->yMax > *yMax) {
// x = horizontal midpoint of character i
613 x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
618 // compare the strings
619 for (p1 = p, q = s; *q; ++p1, ++q) {
620 if (tolower(*p1) != tolower(*q))
// left edge of the first matched character (the right edge of the
// character before it, or the string's xMin at position 0) ...
626 *xMin = (i == 0) ? str->xMin : str->xRight[i-1];
// ... and right edge of the last matched character
627 *xMax = str->xRight[i+n-1];
// Collect into a GString the text that lies in the rectangle
// (<xMin>, <yMin>) .. (<xMax>, <yMax>).  For each string of the y-major
// list that qualifies, only the characters whose horizontal midpoints
// fall inside [xMin, xMax] are taken.
637 GString *TextPage::getText(double xMin, double yMin,
638 double xMax, double yMax) {
641 double x0, x1, x2, y;
649 for (str1 = yxStrings; str1; str1 = str1->yxNext) {
// y = vertical midpoint of the string
650 y = 0.5 * (str1->yMin + str1->yMax);
// midpoint past yMin, and the string overlaps (xMin, xMax) horizontally
653 if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) {
654 x0 = x1 = x2 = str1->xMin;
// i1 = first character whose horizontal midpoint is >= xMin
655 for (i1 = 0; i1 < str1->text->getLength(); ++i1) {
656 x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1];
657 x1 = str1->xRight[i1];
658 if (0.5 * (x0 + x1) >= xMin)
// i2 = last character (scanning backwards, not below i1) whose
// horizontal midpoint is <= xMax
661 for (i2 = str1->text->getLength() - 1; i2 > i1; --i2) {
662 x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1];
663 x2 = str1->xRight[i2];
664 if (0.5 * (x1 + x2) <= xMax)
// once something has been collected, compare this piece's position
// with the previous one (xPrev, yPrev)
667 if (s->getLength() > 0) {
668 if (x0 < xPrev || str1->yMin > yPrev) {
// append characters i1 .. i2 inclusive
675 s->append(str1->text->getCString() + i1, i2 - i1 + 1);
// Write the text of the page to <f>.  The work is done in three steps,
// marked by the comments below: an x-major list (xyStrings) is built
// from the y-major list, every string is assigned an output column
// (col), and the strings are then printed in y-major order with
// horizontal padding and vertical spacing.
685 void TextPage::dump(FILE *f) {
686 TextString *str1, *str2, *str3;
691 // build x-major list
693 for (str1 = yxStrings; str1; str1 = str1->yxNext) {
// find the insertion point: str2 trails one element behind str3
694 for (str2 = NULL, str3 = xyStrings;
696 str2 = str3, str3 = str3->xyNext) {
// ordering: smaller xMin first, ties broken by smaller yMin
697 if (str1->xMin < str3->xMin ||
698 (str1->xMin == str3->xMin && str1->yMin < str3->yMin))
708 // do column assignment
709 for (str1 = xyStrings; str1; str1 = str1->xyNext) {
// compare against every string that precedes str1 in the x-major list
711 for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) {
// str1 starts at or beyond the end of str2: candidate column is four
// past the last column of str2
712 if (str1->xMin >= str2->xMax) {
713 col2 = str2->col + str2->text->getLength() + 4;
// str1 starts inside str2: offset proportional to how far into str2
// (by x position) str1 begins, scaled to str2's character count
716 } else if (str1->xMin > str2->xMin) {
718 (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) *
719 str2->text->getLength());
728 #if 0 //~ for debugging
729 fprintf(f, "~~~~~~~~~~\n");
730 for (str1 = yxStrings; str1; str1 = str1->yxNext) {
731 fprintf(f, "(%4d,%4d) - (%4d,%4d) [%3d] %s\n",
732 (int)str1->xMin, (int)str1->yMin, (int)str1->xMax, (int)str1->yMax,
733 str1->col, str1->text->getCString());
735 fprintf(f, "~~~~~~~~~~\n");
// yMax tracks the bottom of the line currently being printed
740 yMax = yxStrings ? yxStrings->yMax : 0;
741 for (str1 = yxStrings; str1; str1 = str1->yxNext) {
743 // line this string up with the correct column
744 if (rawOrder && col1 == 0) {
// advance col1 until it reaches the column assigned to this string
747 for (; col1 < str1->col; ++col1) {
753 fputs(str1->text->getCString(), f);
756 col1 += str1->text->getLength();
758 // update yMax for this line
759 if (str1->yMax > yMax)
762 // if we've hit the end of the line...
// two variants of the test: the line continues only if a next string
// exists, starts above a vertical limit of str1 and starts at or to the
// right of str1's end; the variants differ only in that limit
// (str1->yMax versus 0.2*yMin + 0.8*yMax)
764 if (!(str1->yxNext &&
765 !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
766 str1->yxNext->yMin < str1->yMax &&
767 str1->yxNext->xMin >= str1->xMax)) {
769 if (!(str1->yxNext &&
770 !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
771 str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax &&
772 str1->yxNext->xMin >= str1->xMax)) {
778 // print extra vertical space if necessary
781 // find yMin for next line
782 yMin = str1->yxNext->yMin;
783 for (str2 = str1->yxNext; str2; str2 = str2->yxNext) {
784 if (str2->yMin < yMin)
786 if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax &&
787 str2->yxNext->xMin >= str2->xMax))
// d = gap between the two lines measured in heights of str1, rounded to
// the nearest integer
792 d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5);
793 if (rawOrder && d > 2) {
801 // set up for next line
803 yMax = str1->yxNext ? str1->yxNext->yMax : 0;
// Reset the page: walk the y-major list of strings and forget the
// remembered insertion neighbours.
808 void TextPage::clear() {
// p2 holds the successor so the walk can continue from it (p1 = p2)
815 for (p1 = yxStrings; p1; p1 = p2) {
821 yxCur1 = yxCur2 = NULL;
824 //------------------------------------------------------------------------
826 //------------------------------------------------------------------------
// Create a text output device.  The file name "-" is treated specially;
// any other name is opened for writing with fopen, and a failure to open
// it is reported through error().  <useASCII7> and <rawOrder> are passed
// on to the TextPage that collects the text.
828 TextOutputDev::TextOutputDev(char *fileName, GBool useASCII7, GBool rawOrder) {
830 this->rawOrder = rawOrder;
836 if (!strcmp(fileName, "-")) {
838 } else if ((f = fopen(fileName, "w"))) {
841 error(-1, "Couldn't open text file '%s'", fileName);
849 // set up text object
850 text = new TextPage(useASCII7, rawOrder);
// Destructor for TextOutputDev.
853 TextOutputDev::~TextOutputDev() {
// Start-of-page handler; receives the page number and the graphics state.
860 void TextOutputDev::startPage(int pageNum, GfxState *state) {
// End-of-page handler; counterpart of startPage.
864 void TextOutputDev::endPage() {
// Font-change handler.  For an 8-bit font, all 256 character codes are
// scanned for glyph names that look like hexadecimal codes: exactly three
// characters, starting with 'B', 'C' or 'G', with a hex letter (a-f or
// A-F) in the second or third position.
// NOTE(review): a hit presumably sets the hexCodes member that
// beginString passes on -- confirm.
874 void TextOutputDev::updateFont(GfxState *state) {
879 // look for hex char codes in subsetted font
881 if ((font = state->getFont()) && !font->is16Bit()) {
882 for (c = 0; c < 256; ++c) {
883 if ((charName = font->getCharName(c))) {
884 if ((charName[0] == 'B' || charName[0] == 'C' ||
885 charName[0] == 'G') &&
886 strlen(charName) == 3 &&
// at least one of the two trailing characters is a hex letter, i.e. the
// name cannot be read as a plain decimal number
887 ((charName[1] >= 'a' && charName[1] <= 'f') ||
888 (charName[1] >= 'A' && charName[1] <= 'F') ||
889 (charName[2] >= 'a' && charName[2] <= 'f') ||
890 (charName[2] >= 'A' && charName[2] <= 'F'))) {
// Start a new string on the text page, passing along this device's
// hexCodes setting.
899 void TextOutputDev::beginString(GfxState *state, GString *s) {
900 text->beginString(state, s, hexCodes);
// End-of-string handler; counterpart of beginString.
903 void TextOutputDev::endString(GfxState *state) {
// Forward an 8-bit character, with its position (<x>, <y>) and advance
// (<dx>, <dy>), to the text page.
907 void TextOutputDev::drawChar(GfxState *state, double x, double y,
908 double dx, double dy, Guchar c) {
909 text->addChar(state, x, y, dx, dy, c);
// Forward a 16-bit character, with its position (<x>, <y>) and advance
// (<dx>, <dy>), to the text page.  The character set is taken from the
// current font of <state>.
912 void TextOutputDev::drawChar16(GfxState *state, double x, double y,
913 double dx, double dy, int c) {
914 text->addChar16(state, x, y, dx, dy, c, state->getFont()->getCharSet16());
// Search the current text page for <s>; all arguments and the result are
// passed straight through to TextPage::findText.
917 GBool TextOutputDev::findText(char *s, GBool top, GBool bottom,
918 double *xMin, double *yMin,
919 double *xMax, double *yMax) {
920 return text->findText(s, top, bottom, xMin, yMin, xMax, yMax);