//======================================================================== // // TextOutputDev.cc // // Copyright 1997 Derek B. Noonburg // //======================================================================== #ifdef __GNUC__ #pragma implementation #endif #include #include #include #include #include "GString.h" #include "gmem.h" #include "config.h" #include "Error.h" #include "GfxState.h" #include "GfxFont.h" #include "TextOutputDev.h" #include "TextOutputFontInfo.h" //------------------------------------------------------------------------ // Character substitutions //------------------------------------------------------------------------ static char *isoLatin1Subst[] = { "L", // Lslash "OE", // OE "S", // Scaron "Y", // Ydieresis "Z", // Zcaron "fi", // fi "fl", // fl "i", // dotlessi "l", // lslash "oe", // oe "s", // scaron "z", // zcaron "*", // bullet "...", // ellipsis "-", "-", // emdash, hyphen "\"", "\"", // quotedblleft, quotedblright "'", // quotesingle "TM" // trademark }; static char *ascii7Subst[] = { "A", "A", "A", "A", // A{acute,circumflex,dieresis,grave} "A", "A", // A{ring,tilde} "AE", // AE "C", // Ccedilla "E", "E", "E", "E", // E{acute,circumflex,dieresis,grave} "I", "I", "I", "I", // I{acute,circumflex,dieresis,grave} "L", // Lslash "N", // Ntilde "O", "O", "O", "O", // O{acute,circumflex,dieresis,grave} "O", "O", // O{slash,tilde} "OE", // OE "S", // Scaron "U", "U", "U", "U", // U{acute,circumflex,dieresis,grave} "Y", "Y", // T{acute,dieresis} "Z", // Zcaron "a", "a", "a", "a", // a{acute,circumflex,dieresis,grave} "a", "a", // a{ring,tilde} "ae", // ae "c", // ccedilla "e", "e", "e", "e", // e{acute,circumflex,dieresis,grave} "fi", // fi "fl", // fl "i", // dotlessi "i", "i", "i", "i", // i{acute,circumflex,dieresis,grave} "l", // lslash "n", // ntilde "o", "o", "o", "o", // o{acute,circumflex,dieresis,grave} "o", "o", // o{slash,tilde} "oe", // oe "s", // scaron "u", "u", "u", "u", // u{acute,circumflex,dieresis,grave} "y", "y", // t{acute,dieresis} "z", // zcaron "|", // brokenbar "*", // bullet "...", // ellipsis "-", "-", "-", // emdash, endash, hyphen "\"", "\"", // quotedblleft, quotedblright "'", // quotesingle "(R)", // registered "TM" // trademark }; //------------------------------------------------------------------------ // TextString //------------------------------------------------------------------------ TextString::TextString(GfxState *state, GBool hexCodes1) { double x, y, h; state->transform(state->getCurX(), state->getCurY(), &x, &y); h = state->getTransformedFontSize(); //~ yMin/yMax computation should use font ascent/descent values yMin = y - 0.95 * h; yMax = yMin + 1.3 * h; col = 0; text = new GString(); xRight = NULL; yxNext = NULL; xyNext = NULL; hexCodes = hexCodes1; } TextString::~TextString() { delete text; gfree(xRight); } void TextString::addChar(GfxState *state, double x, double y, double dx, double dy, Guchar c, GBool useASCII7) { char *charName, *sub; int c1; int i, j, n, m; // get current index i = text->getLength(); // append translated character(s) to string sub = NULL; n = 1; if ((charName = state->getFont()->getCharName(c))) { if (useASCII7) c1 = ascii7Encoding.getCharCode(charName); else c1 = isoLatin1Encoding.getCharCode(charName); if (c1 < 0) { m = strlen(charName); if (hexCodes && m == 3 && (charName[0] == 'B' || charName[0] == 'C' || charName[0] == 'G') && isxdigit(charName[1]) && isxdigit(charName[2])) { sscanf(charName+1, "%x", &c1); } else if (!hexCodes && m >= 2 && m <= 3 && isdigit(charName[0]) && isdigit(charName[1])) { c1 = atoi(charName); if (c1 >= 256) c1 = -1; } else if (!hexCodes && m >= 3 && m <= 5 && isdigit(charName[1])) { c1 = atoi(charName+1); if (c1 >= 256) c1 = -1; } //~ this is a kludge -- is there a standard internal encoding //~ used by all/most Type 1 fonts? if (c1 == 262) // hyphen c1 = 45; else if (c1 == 266) // emdash c1 = 208; if (useASCII7) c1 = ascii7Encoding.getCharCode(isoLatin1Encoding.getCharName(c1)); } if (useASCII7) { if (c1 >= 128) { sub = ascii7Subst[c1 - 128]; n = strlen(sub); } } else { if (c1 >= 256) { sub = isoLatin1Subst[c1 - 256]; n = strlen(sub); } } } else { c1 = -1; } if (sub) text->append(sub); else if (c1 >= 0) text->append((char)c1); else text->append(' '); // update position information if (i+n > ((i+15) & ~15)) xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double)); if (i == 0) xMin = x; for (j = 0; j < n; ++j) xRight[i+j] = x + ((j+1) * dx) / n; xMax = x + dx; } //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool useASCII71) { useASCII7 = useASCII71; curStr = NULL; yxStrings = NULL; xyStrings = NULL; } TextPage::~TextPage() { clear(); } void TextPage::beginString(GfxState *state, GString *s, GBool hexCodes) { curStr = new TextString(state, hexCodes); } void TextPage::addChar(GfxState *state, double x, double y, double dx, double dy, Guchar c) { double x1, y1, w1, h1; state->transform(x, y, &x1, &y1); state->transformDelta(dx, dy, &w1, &h1); curStr->addChar(state, x1, y1, w1, h1, c, useASCII7); } void TextPage::endString() { TextString *p1, *p2; double h, y1, y2; // throw away zero-length strings -- they don't have valid xMin/xMax // values, and they're useless anyway if (curStr->text->getLength() == 0) { delete curStr; curStr = NULL; return; } #if 0 //~tmp if (curStr->yMax - curStr->yMin > 20) { delete curStr; curStr = NULL; return; } #endif // insert string in y-major list h = curStr->yMax - curStr->yMin; y1 = curStr->yMin + 0.5 * h; y2 = curStr->yMin + 0.8 * h; for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) { if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) break; } if (p1) p1->yxNext = curStr; else yxStrings = curStr; curStr->yxNext = p2; curStr = NULL; } void TextPage::coalesce() { TextString *str1, *str2; double space, d; int n, i; #if 0 //~ for debugging for (str1 = yxStrings; str1; str1 = str1->yxNext) { printf("x=%3d..%3d y=%3d..%3d size=%2d '%s'\n", (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax, (int)(str1->yMax - str1->yMin), str1->text->getCString()); } printf("\n------------------------------------------------------------\n\n"); #endif str1 = yxStrings; while (str1 && (str2 = str1->yxNext)) { space = str1->yMax - str1->yMin; d = str2->xMin - str1->xMax; #if 0 //~tmp if (str2->yMin < str1->yMax && d > -0.1 * space && d < 0.2 * space) { #else if (str2->yMin < str1->yMax && d > -0.5 * space && d < space) { #endif n = str1->text->getLength(); if (d > 0.1 * space) str1->text->append(' '); str1->text->append(str2->text); str1->xRight = (double *) grealloc(str1->xRight, str1->text->getLength() * sizeof(double)); if (d > 0.1 * space) str1->xRight[n++] = str2->xMin; for (i = 0; i < str2->text->getLength(); ++i) str1->xRight[n++] = str2->xRight[i]; if (str2->xMax > str1->xMax) str1->xMax = str2->xMax; if (str2->yMax > str1->yMax) str1->yMax = str2->yMax; str1->yxNext = str2->yxNext; delete str2; } else { str1 = str2; } } } GBool TextPage::findText(char *s, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { TextString *str; char *p, *p1, *q; int n, m, i; double x; // scan all strings on page n = strlen(s); for (str = yxStrings; str; str = str->yxNext) { // check: above top limit? if (!top && (str->yMax < *yMin || (str->yMin < *yMin && str->xMax <= *xMin))) continue; // check: below bottom limit? if (!bottom && (str->yMin > *yMax || (str->yMax > *yMax && str->xMin >= *xMax))) return gFalse; // search each position in this string m = str->text->getLength(); for (i = 0, p = str->text->getCString(); i <= m - n; ++i, ++p) { // check: above top limit? if (!top && str->yMin < *yMin) { x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; if (x < *xMin) continue; } // check: below bottom limit? if (!bottom && str->yMax > *yMax) { x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; if (x > *xMax) return gFalse; } // compare the strings for (p1 = p, q = s; *q; ++p1, ++q) { if (tolower(*p1) != tolower(*q)) break; } // found it if (!*q) { *xMin = (i == 0) ? str->xMin : str->xRight[i-1]; *xMax = str->xRight[i+n-1]; *yMin = str->yMin; *yMax = str->yMax; return gTrue; } } } return gFalse; } GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; TextString *str1; double x0, x1, x2, y; double xPrev, yPrev; int i1, i2; GBool multiLine; s = new GString(); xPrev = yPrev = 0; multiLine = gFalse; for (str1 = yxStrings; str1; str1 = str1->yxNext) { y = 0.5 * (str1->yMin + str1->yMax); if (y > yMax) break; if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) { x0 = x1 = x2 = str1->xMin; for (i1 = 0; i1 < str1->text->getLength(); ++i1) { x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1]; x1 = str1->xRight[i1]; if (0.5 * (x0 + x1) >= xMin) break; } for (i2 = str1->text->getLength() - 1; i2 > i1; --i2) { x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1]; x2 = str1->xRight[i2]; if (0.5 * (x1 + x2) <= xMax) break; } if (s->getLength() > 0) { if (x0 < xPrev || str1->yMin > yPrev) { s->append('\n'); multiLine = gTrue; } else { s->append(" "); } } s->append(str1->text->getCString() + i1, i2 - i1 + 1); xPrev = x2; yPrev = str1->yMax; } } if (multiLine) s->append('\n'); return s; } void TextPage::dump(FILE *f) { TextString *str1, *str2, *str3; double yMin, yMax; int col1, col2; double d; // build x-major list xyStrings = NULL; for (str1 = yxStrings; str1; str1 = str1->yxNext) { for (str2 = NULL, str3 = xyStrings; str3; str2 = str3, str3 = str3->xyNext) { if (str1->xMin < str3->xMin || (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) break; } if (str2) str2->xyNext = str1; else xyStrings = str1; str1->xyNext = str3; } // do column assignment for (str1 = xyStrings; str1; str1 = str1->xyNext) { col1 = 0; for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) { if (str1->xMin >= str2->xMax) { col2 = str2->col + str2->text->getLength() + 4; if (col2 > col1) col1 = col2; } else if (str1->xMin > str2->xMin) { col2 = str2->col + (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) * str2->text->getLength()); if (col2 > col1) { col1 = col2; } } } str1->col = col1; } #if 0 //~ for debugging fprintf(f, "~~~~~~~~~~\n"); for (str1 = yxStrings; str1; str1 = str1->yxNext) { fprintf(f, "(%4d,%4d) - (%4d,%4d) [%3d] %s\n", (int)str1->xMin, (int)str1->yMin, (int)str1->xMax, (int)str1->yMax, str1->col, str1->text->getCString()); } fprintf(f, "~~~~~~~~~~\n"); #endif // output col1 = 0; yMax = yxStrings ? yxStrings->yMax : 0; for (str1 = yxStrings; str1; str1 = str1->yxNext) { // line this string up with the correct column for (; col1 < str1->col; ++col1) fputc(' ', f); // print the string fputs(str1->text->getCString(), f); // increment column col1 += str1->text->getLength(); // update yMax for this line if (str1->yMax > yMax) yMax = str1->yMax; // if we've hit the end of the line... #if 0 //~ if (!(str1->yxNext && str1->yxNext->yMin < str1->yMax && str1->yxNext->xMin >= str1->xMax)) { #else if (!(str1->yxNext && str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax && str1->yxNext->xMin >= str1->xMax)) { #endif // print a return fputc('\n', f); // print extra vertical space if necessary if (str1->yxNext) { // find yMin for next line yMin = str1->yxNext->yMin; for (str2 = str1->yxNext; str2; str2 = str2->yxNext) { if (str2->yMin < yMin) yMin = str2->yMin; if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax && str2->yxNext->xMin >= str2->xMax)) break; } // print the space d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5); for (; d > 0; --d) fputc('\n', f); } // set up for next line col1 = 0; yMax = str1->yxNext ? str1->yxNext->yMax : 0; } } } void TextPage::clear() { TextString *p1, *p2; if (curStr) { delete curStr; curStr = NULL; } for (p1 = yxStrings; p1; p1 = p2) { p2 = p1->yxNext; delete p1; } yxStrings = NULL; xyStrings = NULL; } //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ TextOutputDev::TextOutputDev(char *fileName, GBool useASCII7) { text = NULL; ok = gTrue; // open file needClose = gFalse; if (fileName) { if (!strcmp(fileName, "-")) { f = stdout; } else if ((f = fopen(fileName, "w"))) { needClose = gTrue; } else { error(-1, "Couldn't open text file '%s'", fileName); ok = gFalse; return; } } else { f = NULL; } // set up text object text = new TextPage(useASCII7); } TextOutputDev::~TextOutputDev() { if (needClose) fclose(f); if (text) delete text; } void TextOutputDev::startPage(int pageNum, GfxState *state) { text->clear(); } void TextOutputDev::endPage() { text->coalesce(); if (f) { text->dump(f); fputc('\n', f); fputs("\f\n", f); fputc('\n', f); } } void TextOutputDev::updateFont(GfxState *state) { GfxFont *font; char *charName; int c; // look for hex char codes in subsetted font hexCodes = gFalse; if ((font = state->getFont())) { for (c = 0; c < 256; ++c) { if ((charName = font->getCharName(c))) { if ((charName[0] == 'B' || charName[0] == 'C' || charName[0] == 'G') && strlen(charName) == 3 && ((charName[1] >= 'a' && charName[1] <= 'f') || (charName[1] >= 'A' && charName[1] <= 'F') || (charName[2] >= 'a' && charName[2] <= 'f') || (charName[2] >= 'A' && charName[2] <= 'F'))) { hexCodes = gTrue; break; } } } } } void TextOutputDev::beginString(GfxState *state, GString *s) { text->beginString(state, s, hexCodes); } void TextOutputDev::endString(GfxState *state) { text->endString(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, Guchar c) { text->addChar(state, x, y, dx, dy, c); } GBool TextOutputDev::findText(char *s, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findText(s, top, bottom, xMin, yMin, xMax, yMax); }