X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.cc;h=a492e7f3e178c2b1c4d90bb2953af97059acacb8;hb=2af881bd90a35b4f1343b027ba7c3c0464930fb1;hp=754049238ead3fbfed6512e52a67d4f9258d0264;hpb=7aac8dc8533347e21311b15186e0af82f1b22fd6;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 75404923..a492e7f3 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -2,24 +2,34 @@ // // TextOutputDev.cc // -// Copyright 1997 Derek B. Noonburg +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== -#ifdef __GNUC__ +#include + +#ifdef USE_GCC_PRAGMAS #pragma implementation #endif #include #include #include +#include #include -#include "GString.h" +#ifdef WIN32 +#include // for O_BINARY +#include // for setmode +#endif #include "gmem.h" -#include "config.h" +#include "GString.h" +#include "GList.h" +#include "xpdfconfig.h" #include "Error.h" +#include "GlobalParams.h" +#include "UnicodeMap.h" +#include "UnicodeTypeTable.h" #include "GfxState.h" -#include "FontEncoding.h" #include "TextOutputDev.h" #ifdef MACOS @@ -27,1171 +37,3436 @@ #include "ICSupport.h" #endif -#include "TextOutputFontInfo.h" +//------------------------------------------------------------------------ +// parameters +//------------------------------------------------------------------------ + +// Each bucket in a text pool includes baselines within a range of +// this many points. +#define textPoolStep 4 + +// Inter-character space width which will cause addChar to start a new +// word. +#define minWordBreakSpace 0.1 + +// Negative inter-character space width, i.e., overlap, which will +// cause addChar to start a new word. +#define minDupBreakOverlap 0.2 + +// Max distance between baselines of two lines within a block, as a +// fraction of the font size. +#define maxLineSpacingDelta 1.5 + +// Max difference in primary font sizes on two lines in the same +// block. Delta1 is used when examining new lines above and below the +// current block; delta2 is used when examining text that overlaps the +// current block; delta3 is used when examining text to the left and +// right of the current block. +#define maxBlockFontSizeDelta1 0.05 +#define maxBlockFontSizeDelta2 0.6 +#define maxBlockFontSizeDelta3 0.2 + +// Max difference in font sizes inside a word. +#define maxWordFontSizeDelta 0.05 + +// Maximum distance between baselines of two words on the same line, +// e.g., distance between subscript or superscript and the primary +// baseline, as a fraction of the font size. +#define maxIntraLineDelta 0.5 + +// Minimum inter-word spacing, as a fraction of the font size. (Only +// used for raw ordering.) +#define minWordSpacing 0.15 + +// Maximum inter-word spacing, as a fraction of the font size. +#define maxWordSpacing 1.5 + +// Maximum horizontal spacing which will allow a word to be pulled +// into a block. +#define minColSpacing1 0.3 + +// Minimum spacing between columns, as a fraction of the font size. +#define minColSpacing2 1.0 + +// Maximum vertical spacing between blocks within a flow, as a +// multiple of the font size. +#define maxBlockSpacing 2.5 + +// Minimum spacing between characters within a word, as a fraction of +// the font size. +#define minCharSpacing -0.2 + +// Maximum spacing between characters within a word, as a fraction of +// the font size, when there is no obvious extra-wide character +// spacing. +#define maxCharSpacing 0.03 + +// When extra-wide character spacing is detected, the inter-character +// space threshold is set to the minimum inter-character space +// multiplied by this constant. +#define maxWideCharSpacingMul 1.3 + +// Max difference in primary,secondary coordinates (as a fraction of +// the font size) allowed for duplicated text (fake boldface, drop +// shadows) which is to be discarded. +#define dupMaxPriDelta 0.1 +#define dupMaxSecDelta 0.2 //------------------------------------------------------------------------ -// Character substitutions +// TextFontInfo //------------------------------------------------------------------------ -static char *generalSubstNames[] = { - "zerooldstyle", - "oneoldstyle", - "twooldstyle", - "threeoldstyle", - "fouroldstyle", - "fiveoldstyle", - "sixoldstyle", - "sevenoldstyle", - "eightoldstyle", - "nineoldstyle", - "oldstylezero", - "oldstyleone", - "oldstyletwo", - "oldstylethree", - "oldstylefour", - "oldstylefive", - "oldstylesix", - "oldstyleseven", - "oldstyleeight", - "oldstylenine" -}; +TextFontInfo::TextFontInfo(GfxState *state) { + gfxFont = state->getFont(); +#if TEXTOUT_WORD_LIST + fontName = (gfxFont && gfxFont->getOrigName()) + ? gfxFont->getOrigName()->copy() + : (GString *)NULL; +#endif +} -static FontEncoding generalSubstEncoding(generalSubstNames, - sizeof(generalSubstNames) / - sizeof(char *)); - -static char *generalSubst[] = { - "zero", - "one", - "two", - "three", - "four", - "five", - "six", - "seven", - "eight", - "nine", - "zero", - "one", - "two", - "three", - "four", - "five", - "six", - "seven", - "eight", - "nine" -}; +TextFontInfo::~TextFontInfo() { +#if TEXTOUT_WORD_LIST + if (fontName) { + delete fontName; + } +#endif +} -static char *ascii7Subst[] = { - "A", "A", "A", "A", // A{acute,circumflex,dieresis,grave} - "A", "A", // A{ring,tilde} - "AE", // AE - "C", // Ccedilla - "E", "E", "E", "E", // E{acute,circumflex,dieresis,grave} - "I", "I", "I", "I", // I{acute,circumflex,dieresis,grave} - "L", // Lslash - "N", // Ntilde - "O", "O", "O", "O", // O{acute,circumflex,dieresis,grave} - "O", "O", // O{slash,tilde} - "OE", // OE - "S", // Scaron - "U", "U", "U", "U", // U{acute,circumflex,dieresis,grave} - "Y", "Y", // T{acute,dieresis} - "Z", // Zcaron - "a", "a", "a", "a", // a{acute,circumflex,dieresis,grave} - "a", "a", // a{ring,tilde} - "ae", // ae - "c", // ccedilla - "e", "e", "e", "e", // e{acute,circumflex,dieresis,grave} - "fi", "fl", // ligatures - "ff", "ffi", "ffl", // ligatures - "i", // dotlessi - "i", "i", "i", "i", // i{acute,circumflex,dieresis,grave} - "l", // lslash - "n", // ntilde - "o", "o", "o", "o", // o{acute,circumflex,dieresis,grave} - "o", "o", // o{slash,tilde} - "oe", // oe - "s", // scaron - "u", "u", "u", "u", // u{acute,circumflex,dieresis,grave} - "y", "y", // t{acute,dieresis} - "z", // zcaron - "|", // brokenbar - "*", // bullet - "...", // ellipsis - "-", "-", "-", // emdash, endash, hyphen - "\"", "\"", // quotedblleft, quotedblright - "'", // quotesingle - "(R)", // registered - "TM" // trademark -}; +GBool TextFontInfo::matches(GfxState *state) { + return state->getFont() == gfxFont; +} -static char *isoLatin1Subst[] = { - "L", // Lslash - "OE", // OE - "S", // Scaron - "Y", // Ydieresis - "Z", // Zcaron - "fi", "fl", // ligatures - "ff", "ffi", "ffl", // ligatures - "i", // dotlessi - "l", // lslash - "oe", // oe - "s", // scaron - "z", // zcaron - "*", // bullet - "...", // ellipsis - "-", "-", // emdash, hyphen - "\"", "\"", // quotedblleft, quotedblright - "'", // quotesingle - "TM" // trademark -}; +//------------------------------------------------------------------------ +// TextWord +//------------------------------------------------------------------------ -static char *isoLatin2Subst[] = { - "fi", "fl", // ligatures - "ff", "ffi", "ffl", // ligatures - "*", // bullet - "...", // ellipsis - "-", "-", // emdash, hyphen - "\"", "\"", // quotedblleft, quotedblright - "'", // quotesingle - "TM" // trademark -}; +TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, + int charPosA, TextFontInfo *fontA, double fontSizeA) { + GfxFont *gfxFont; + double x, y, ascent, descent; + + rot = rotA; + charPos = charPosA; + charLen = 0; + font = fontA; + fontSize = fontSizeA; + state->transform(x0, y0, &x, &y); + if ((gfxFont = font->gfxFont)) { + ascent = gfxFont->getAscent() * fontSize; + descent = gfxFont->getDescent() * fontSize; + } else { + // this means that the PDF file draws text without a current font, + // which should never happen + ascent = 0.95 * fontSize; + descent = -0.35 * fontSize; + } + switch (rot) { + case 0: + yMin = y - ascent; + yMax = y - descent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 1: + xMin = x + descent; + xMax = x + ascent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; + case 2: + yMin = y + descent; + yMax = y + ascent; + if (yMin == yMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + yMin = y; + yMax = y + 1; + } + base = y; + break; + case 3: + xMin = x - ascent; + xMax = x - descent; + if (xMin == xMax) { + // this is a sanity check for a case that shouldn't happen -- but + // if it does happen, we want to avoid dividing by zero later + xMin = x; + xMax = x + 1; + } + base = x; + break; + } + text = NULL; + edge = NULL; + len = size = 0; + spaceAfter = gFalse; + next = NULL; + +#if TEXTOUT_WORD_LIST + GfxRGB rgb; + + if ((state->getRender() & 3) == 1) { + state->getStrokeRGB(&rgb); + } else { + state->getFillRGB(&rgb); + } + colorR = rgb.r; + colorG = rgb.g; + colorB = rgb.b; +#endif +} + +TextWord::~TextWord() { + gfree(text); + gfree(edge); +} + +void TextWord::addChar(GfxState *state, double x, double y, + double dx, double dy, Unicode u) { + if (len == size) { + size += 16; + text = (Unicode *)grealloc(text, size * sizeof(Unicode)); + edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); + } + text[len] = u; + switch (rot) { + case 0: + if (len == 0) { + xMin = x; + } + edge[len] = x; + xMax = edge[len+1] = x + dx; + break; + case 1: + if (len == 0) { + yMin = y; + } + edge[len] = y; + yMax = edge[len+1] = y + dy; + break; + case 2: + if (len == 0) { + xMax = x; + } + edge[len] = x; + xMin = edge[len+1] = x + dx; + break; + case 3: + if (len == 0) { + yMax = y; + } + edge[len] = y; + yMin = edge[len+1] = y + dy; + break; + } + ++len; +} + +void TextWord::merge(TextWord *word) { + int i; + + if (word->xMin < xMin) { + xMin = word->xMin; + } + if (word->yMin < yMin) { + yMin = word->yMin; + } + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + if (len + word->len > size) { + size = len + word->len; + text = (Unicode *)grealloc(text, size * sizeof(Unicode)); + edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); + } + for (i = 0; i < word->len; ++i) { + text[len + i] = word->text[i]; + edge[len + i] = word->edge[i]; + } + edge[len + word->len] = word->edge[word->len]; + len += word->len; + charLen += word->charLen; +} + +inline int TextWord::primaryCmp(TextWord *word) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - word->xMin; + break; + case 1: + cmp = yMin - word->yMin; + break; + case 2: + cmp = word->xMax - xMax; + break; + case 3: + cmp = word->yMax - yMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +double TextWord::primaryDelta(TextWord *word) { + double delta; + + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = word->xMin - xMax; + break; + case 1: + delta = word->yMin - yMax; + break; + case 2: + delta = xMin - word->xMax; + break; + case 3: + delta = yMin - word->yMax; + break; + } + return delta; +} + +int TextWord::cmpYX(const void *p1, const void *p2) { + TextWord *word1 = *(TextWord **)p1; + TextWord *word2 = *(TextWord **)p2; + double cmp; + + cmp = word1->yMin - word2->yMin; + if (cmp == 0) { + cmp = word1->xMin - word2->xMin; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +#if TEXTOUT_WORD_LIST + +GString *TextWord::getText() { + GString *s; + UnicodeMap *uMap; + char buf[8]; + int n, i; + + s = new GString(); + if (!(uMap = globalParams->getTextEncoding())) { + return s; + } + for (i = 0; i < len; ++i) { + n = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, n); + } + uMap->decRefCnt(); + return s; +} -static char **isoLatin5Subst = isoLatin1Subst; +#endif // TEXTOUT_WORD_LIST //------------------------------------------------------------------------ -// 16-bit fonts +// TextPool //------------------------------------------------------------------------ -#if JAPANESE_SUPPORT - -// CID 0 .. 96 -static Gushort japan12Map[96] = { - 0x2121, 0x2121, 0x212a, 0x2149, 0x2174, 0x2170, 0x2173, 0x2175, // 00 .. 07 - 0x2147, 0x214a, 0x214b, 0x2176, 0x215c, 0x2124, 0x213e, 0x2123, // 08 .. 0f - 0x213f, 0x2330, 0x2331, 0x2332, 0x2333, 0x2334, 0x2335, 0x2336, // 10 .. 17 - 0x2337, 0x2338, 0x2339, 0x2127, 0x2128, 0x2163, 0x2161, 0x2164, // 18 .. 1f - 0x2129, 0x2177, 0x2341, 0x2342, 0x2343, 0x2344, 0x2345, 0x2346, // 20 .. 27 - 0x2347, 0x2348, 0x2349, 0x234a, 0x234b, 0x234c, 0x234d, 0x234e, // 28 .. 2f - 0x234f, 0x2350, 0x2351, 0x2352, 0x2353, 0x2354, 0x2355, 0x2356, // 30 .. 37 - 0x2357, 0x2358, 0x2359, 0x235a, 0x214e, 0x216f, 0x214f, 0x2130, // 38 .. 3f - 0x2132, 0x2146, 0x2361, 0x2362, 0x2363, 0x2364, 0x2365, 0x2366, // 40 .. 47 - 0x2367, 0x2368, 0x2369, 0x236a, 0x236b, 0x236c, 0x236d, 0x236e, // 48 .. 4f - 0x236f, 0x2370, 0x2371, 0x2372, 0x2373, 0x2374, 0x2375, 0x2376, // 50 .. 57 - 0x2377, 0x2378, 0x2379, 0x237a, 0x2150, 0x2143, 0x2151, 0x2141 // 58 .. 5f -}; +TextPool::TextPool() { + minBaseIdx = 0; + maxBaseIdx = -1; + pool = NULL; + cursor = NULL; + cursorBaseIdx = -1; +} -// CID 325 .. 421 -static Gushort japan12KanaMap1[97] = { - 0x2131, 0x2121, 0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572, - 0x2521, 0x2523, 0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567, - 0x2543, 0x213c, 0x2522, 0x2524, 0x2526, 0x2528, 0x252a, 0x252b, - 0x252d, 0x252f, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253b, - 0x253d, 0x253f, 0x2541, 0x2544, 0x2546, 0x2548, 0x254a, 0x254b, - 0x254c, 0x254d, 0x254e, 0x254f, 0x2552, 0x2555, 0x2558, 0x255b, - 0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568, - 0x2569, 0x256a, 0x256b, 0x256c, 0x256d, 0x256f, 0x2573, 0x212b, - 0x212c, 0x212e, 0x2570, 0x2571, 0x256e, 0x2575, 0x2576, 0x2574, - 0x252c, 0x252e, 0x2530, 0x2532, 0x2534, 0x2536, 0x2538, 0x253a, - 0x253c, 0x253e, 0x2540, 0x2542, 0x2545, 0x2547, 0x2549, 0x2550, - 0x2551, 0x2553, 0x2554, 0x2556, 0x2557, 0x2559, 0x255a, 0x255c, - 0x255d -}; +TextPool::~TextPool() { + int baseIdx; + TextWord *word, *word2; -// CID 501 .. 598 -static Gushort japan12KanaMap2[98] = { - 0x212d, 0x212f, 0x216d, 0x214c, 0x214d, 0x2152, 0x2153, 0x2154, - 0x2155, 0x2158, 0x2159, 0x215a, 0x215b, 0x213d, 0x2121, 0x2472, - 0x2421, 0x2423, 0x2425, 0x2427, 0x2429, 0x2463, 0x2465, 0x2467, - 0x2443, 0x2422, 0x2424, 0x2426, 0x2428, 0x242a, 0x242b, 0x242d, - 0x242f, 0x2431, 0x2433, 0x2435, 0x2437, 0x2439, 0x243b, 0x243d, - 0x243f, 0x2441, 0x2444, 0x2446, 0x2448, 0x244a, 0x244b, 0x244c, - 0x244d, 0x244e, 0x244f, 0x2452, 0x2455, 0x2458, 0x245b, 0x245e, - 0x245f, 0x2460, 0x2461, 0x2462, 0x2464, 0x2466, 0x2468, 0x2469, - 0x246a, 0x246b, 0x246c, 0x246d, 0x246f, 0x2473, 0x2470, 0x2471, - 0x246e, 0x242c, 0x242e, 0x2430, 0x2432, 0x2434, 0x2436, 0x2438, - 0x243a, 0x243c, 0x243e, 0x2440, 0x2442, 0x2445, 0x2447, 0x2449, - 0x2450, 0x2451, 0x2453, 0x2454, 0x2456, 0x2457, 0x2459, 0x245a, - 0x245c, 0x245d -}; + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + for (word = pool[baseIdx - minBaseIdx]; word; word = word2) { + word2 = word->next; + delete word; + } + } + gfree(pool); +} -static char *japan12Roman[10] = { - "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X" -}; +int TextPool::getBaseIdx(double base) { + int baseIdx; -static char *japan12Abbrev1[6] = { - "mm", "cm", "km", "mg", "kg", "cc" -}; + baseIdx = (int)(base / textPoolStep); + if (baseIdx < minBaseIdx) { + return minBaseIdx; + } + if (baseIdx > maxBaseIdx) { + return maxBaseIdx; + } + return baseIdx; +} -#endif +void TextPool::addWord(TextWord *word) { + TextWord **newPool; + int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx; + TextWord *w0, *w1; + + // expand the array if needed + wordBaseIdx = (int)(word->base / textPoolStep); + if (minBaseIdx > maxBaseIdx) { + minBaseIdx = wordBaseIdx - 128; + maxBaseIdx = wordBaseIdx + 128; + pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + pool[baseIdx - minBaseIdx] = NULL; + } + } else if (wordBaseIdx < minBaseIdx) { + newMinBaseIdx = wordBaseIdx - 128; + newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) { + newPool[baseIdx - newMinBaseIdx] = NULL; + } + memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, + (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); + gfree(pool); + pool = newPool; + minBaseIdx = newMinBaseIdx; + } else if (wordBaseIdx > maxBaseIdx) { + newMaxBaseIdx = wordBaseIdx + 128; + pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) * + sizeof(TextWord *)); + for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) { + pool[baseIdx - minBaseIdx] = NULL; + } + maxBaseIdx = newMaxBaseIdx; + } -#if CHINESE_CNS_SUPPORT - -static Gushort cns13Map1[99] = { - // 0-98 - 0, 0xa140, 0xa149, 0xa1a8, 0xa1ad, 0xa243, 0xa248, 0xa1ae, - 0xa1a6, 0xa15d, 0xa15e, 0xa1af, 0xa1cf, 0xa141, 0xa1df, 0xa144, - 0xa241, 0xa2af, 0xa2b0, 0xa2b1, 0xa2b2, 0xa2b3, 0xa2b4, 0xa2b5, - 0xa2b6, 0xa2b7, 0xa2b8, 0xa147, 0xa146, 0xa1d5, 0xa1d7, 0xa1d6, - 0xa148, 0xa249, 0xa2cf, 0xa2d0, 0xa2d1, 0xa2d2, 0xa2d3, 0xa2d4, - 0xa2d5, 0xa2d6, 0xa2d7, 0xa2d8, 0xa2d9, 0xa2da, 0xa2db, 0xa2dc, - 0xa2dd, 0xa2de, 0xa2df, 0xa2e0, 0xa2e1, 0xa2e2, 0xa2e3, 0xa2e4, - 0xa2e5, 0xa2e6, 0xa2e7, 0xa2e8, 0xa165, 0xa242, 0xa166, 0xa173, - 0xa15a, 0xa1a5, 0xa2e9, 0xa2ea, 0xa2eb, 0xa2ec, 0xa2ed, 0xa2ee, - 0xa2ef, 0xa2f0, 0xa2f1, 0xa2f2, 0xa2f3, 0xa2f4, 0xa2f5, 0xa2f6, - 0xa2f7, 0xa2f8, 0xa2f9, 0xa2fa, 0xa2fb, 0xa2fc, 0xa2fd, 0xa2fe, - 0xa340, 0xa341, 0xa342, 0xa343, 0xa161, 0xa159, 0xa162, 0xa1e3, - 0, 0, 0xa14b -}; + // insert the new word + if (cursor && wordBaseIdx == cursorBaseIdx && + word->primaryCmp(cursor) > 0) { + w0 = cursor; + w1 = cursor->next; + } else { + w0 = NULL; + w1 = pool[wordBaseIdx - minBaseIdx]; + } + for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ; + word->next = w1; + if (w0) { + w0->next = word; + } else { + pool[wordBaseIdx - minBaseIdx] = word; + } + cursor = word; + cursorBaseIdx = wordBaseIdx; +} -static Gushort cns13Map2[95] = { - // 13648-13742 - 0xa140, 0xa149, 0xa1a8, 0xa1ad, 0xa244, 0xa248, 0xa1ae, - 0xa1a6, 0xa15d, 0xa15e, 0xa1af, 0xa1cf, 0xa141, 0xa1df, 0xa144, - 0xa241, 0xa2af, 0xa2b0, 0xa2b1, 0xa2b2, 0xa2b3, 0xa2b4, 0xa2b5, - 0xa2b6, 0xa2b7, 0xa2b8, 0xa147, 0xa146, 0xa1d5, 0xa1d7, 0xa1d6, - 0xa148, 0xa249, 0xa2cf, 0xa2d0, 0xa2d1, 0xa2d2, 0xa2d3, 0xa2d4, - 0xa2d5, 0xa2d6, 0xa2d7, 0xa2d8, 0xa2d9, 0xa2da, 0xa2db, 0xa2dc, - 0xa2dd, 0xa2de, 0xa2df, 0xa2e0, 0xa2e1, 0xa2e2, 0xa2e3, 0xa2e4, - 0xa2e5, 0xa2e6, 0xa2e7, 0xa2e8, 0xa165, 0xa242, 0xa166, 0xa173, - 0xa15a, 0xa1a5, 0xa2e9, 0xa2ea, 0xa2eb, 0xa2ec, 0xa2ed, 0xa2ee, - 0xa2ef, 0xa2f0, 0xa2f1, 0xa2f2, 0xa2f3, 0xa2f4, 0xa2f5, 0xa2f6, - 0xa2f7, 0xa2f8, 0xa2f9, 0xa2fa, 0xa2fb, 0xa2fc, 0xa2fd, 0xa2fe, - 0xa340, 0xa341, 0xa342, 0xa343, 0xa161, 0xa159, 0xa162, 0xa1c3 -}; +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ -#endif +TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) { + blk = blkA; + rot = rotA; + xMin = yMin = 0; + xMax = yMax = -1; + base = baseA; + words = lastWord = NULL; + text = NULL; + edge = NULL; + col = NULL; + len = 0; + convertedLen = 0; + hyphenated = gFalse; + next = NULL; +} + +TextLine::~TextLine() { + TextWord *word; + + while (words) { + word = words; + words = words->next; + delete word; + } + gfree(text); + gfree(edge); + gfree(col); +} + +void TextLine::addWord(TextWord *word) { + if (lastWord) { + lastWord->next = word; + } else { + words = word; + } + lastWord = word; + + if (xMin > xMax) { + xMin = word->xMin; + xMax = word->xMax; + yMin = word->yMin; + yMax = word->yMax; + } else { + if (word->xMin < xMin) { + xMin = word->xMin; + } + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMin < yMin) { + yMin = word->yMin; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + } +} + +double TextLine::primaryDelta(TextLine *line) { + double delta; + + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = line->xMin - xMax; + break; + case 1: + delta = line->yMin - yMax; + break; + case 2: + delta = xMin - line->xMax; + break; + case 3: + delta = yMin - line->yMax; + break; + } + return delta; +} + +int TextLine::primaryCmp(TextLine *line) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - line->xMin; + break; + case 1: + cmp = yMin - line->yMin; + break; + case 2: + cmp = line->xMax - xMax; + break; + case 3: + cmp = line->yMax - yMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLine::secondaryCmp(TextLine *line) { + double cmp; + + cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base; + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLine::cmpYX(TextLine *line) { + int cmp; + + if ((cmp = secondaryCmp(line))) { + return cmp; + } + return primaryCmp(line); +} + +int TextLine::cmpXY(const void *p1, const void *p2) { + TextLine *line1 = *(TextLine **)p1; + TextLine *line2 = *(TextLine **)p2; + int cmp; + + if ((cmp = line1->primaryCmp(line2))) { + return cmp; + } + return line1->secondaryCmp(line2); +} + +void TextLine::coalesce(UnicodeMap *uMap) { + TextWord *word0, *word1; + double space, delta, minSpace; + GBool isUnicode; + char buf[8]; + int i, j; + + if (words->next) { + + // compute the inter-word space threshold + if (words->len > 1 || words->next->len > 1) { + minSpace = 0; + } else { + minSpace = words->primaryDelta(words->next); + for (word0 = words->next, word1 = word0->next; + word1 && minSpace > 0; + word0 = word1, word1 = word0->next) { + if (word1->len > 1) { + minSpace = 0; + } + delta = word0->primaryDelta(word1); + if (delta < minSpace) { + minSpace = delta; + } + } + } + if (minSpace <= 0) { + space = maxCharSpacing * words->fontSize; + } else { + space = maxWideCharSpacingMul * minSpace; + } + + // merge words + word0 = words; + word1 = words->next; + while (word1) { + if (word0->primaryDelta(word1) >= space) { + word0->spaceAfter = gTrue; + word0 = word1; + word1 = word1->next; + } else if (word0->font == word1->font && + fabs(word0->fontSize - word1->fontSize) < + maxWordFontSizeDelta * words->fontSize && + word1->charPos == word0->charPos + word0->charLen) { + word0->merge(word1); + word0->next = word1->next; + delete word1; + word1 = word0->next; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + + // build the line text + isUnicode = uMap ? uMap->isUnicode() : gFalse; + len = 0; + for (word1 = words; word1; word1 = word1->next) { + len += word1->len; + if (word1->spaceAfter) { + ++len; + } + } + text = (Unicode *)gmalloc(len * sizeof(Unicode)); + edge = (double *)gmalloc((len + 1) * sizeof(double)); + i = 0; + for (word1 = words; word1; word1 = word1->next) { + for (j = 0; j < word1->len; ++j) { + text[i] = word1->text[j]; + edge[i] = word1->edge[j]; + ++i; + } + edge[i] = word1->edge[word1->len]; + if (word1->spaceAfter) { + text[i] = (Unicode)0x0020; + ++i; + } + } + + // compute convertedLen and set up the col array + col = (int *)gmalloc((len + 1) * sizeof(int)); + convertedLen = 0; + for (i = 0; i < len; ++i) { + col[i] = convertedLen; + if (isUnicode) { + ++convertedLen; + } else if (uMap) { + convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); + } + } + col[len] = convertedLen; + + // check for hyphen at end of line + //~ need to check for other chars used as hyphens + hyphenated = text[len - 1] == (Unicode)'-'; +} //------------------------------------------------------------------------ -// TextString +// TextLineFrag //------------------------------------------------------------------------ -TextString::TextString(GfxState *state, GBool hexCodes1) { - double x, y, h; - - state->transform(state->getCurX(), state->getCurY(), &x, &y); - h = state->getTransformedFontSize(); - //~ yMin/yMax computation should use font ascent/descent values - yMin = y - 0.95 * h; - yMax = yMin + 1.3 * h; - col = 0; - text = new GString(); - xRight = NULL; - yxNext = NULL; - xyNext = NULL; - hexCodes = hexCodes1; -} - -TextString::~TextString() { - delete text; - gfree(xRight); -} - -void TextString::addChar(GfxState *state, double x, double y, - double dx, double dy, - Guchar c, TextOutputCharSet charSet) { - char *charName, *sub; - int c1; - int i, j, n, m; - - // get current index - i = text->getLength(); - - // append translated character(s) to string - sub = NULL; - n = 1; - if ((charName = state->getFont()->getCharName(c))) { - if ((c1 = generalSubstEncoding.getCharCode(charName)) >= 0) { - charName = generalSubst[c1]; - } - switch (charSet) { - case textOutASCII7: - c1 = ascii7Encoding.getCharCode(charName); +class TextLineFrag { +public: + + TextLine *line; // the line object + int start, len; // offset and length of this fragment + // (in Unicode chars) + double xMin, xMax; // bounding box coordinates + double yMin, yMax; + double base; // baseline virtual coordinate + int col; // first column + + void init(TextLine *lineA, int startA, int lenA); + void computeCoords(GBool oneRot); + + static int cmpYXPrimaryRot(const void *p1, const void *p2); + static int cmpYXLineRot(const void *p1, const void *p2); + static int cmpXYLineRot(const void *p1, const void *p2); +}; + +void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { + line = lineA; + start = startA; + len = lenA; + col = line->col[start]; +} + +void TextLineFrag::computeCoords(GBool oneRot) { + TextBlock *blk; + double d0, d1, d2, d3, d4; + + if (oneRot) { + + switch (line->rot) { + case 0: + xMin = line->edge[start]; + xMax = line->edge[start + len]; + yMin = line->yMin; + yMax = line->yMax; break; - case textOutLatin1: - c1 = isoLatin1Encoding.getCharCode(charName); + case 1: + xMin = line->xMin; + xMax = line->xMax; + yMin = line->edge[start]; + yMax = line->edge[start + len]; break; - case textOutLatin2: - c1 = isoLatin2Encoding.getCharCode(charName); + case 2: + xMin = line->edge[start + len]; + xMax = line->edge[start]; + yMin = line->yMin; + yMax = line->yMax; break; - case textOutLatin5: - c1 = isoLatin5Encoding.getCharCode(charName); + case 3: + xMin = line->xMin; + xMax = line->xMax; + yMin = line->edge[start + len]; + yMax = line->edge[start]; break; } - if (c1 < 0) { - m = strlen(charName); - if (hexCodes && m == 3 && - (charName[0] == 'B' || charName[0] == 'C' || - charName[0] == 'G') && - isxdigit(charName[1]) && isxdigit(charName[2])) { - sscanf(charName+1, "%x", &c1); - } else if (hexCodes && m == 2 && - isxdigit(charName[0]) && isxdigit(charName[1])) { - sscanf(charName, "%x", &c1); - } else if (!hexCodes && m >= 2 && m <= 3 && - isdigit(charName[0]) && isdigit(charName[1])) { - c1 = atoi(charName); - if (c1 >= 256) { - c1 = -1; - } - } else if (m >= 3 && m <= 5 && isdigit(charName[1])) { - c1 = atoi(charName+1); - if (c1 >= 256) { - c1 = -1; - } - } - //~ this is a kludge -- is there a standard internal encoding - //~ used by all/most Type 1 fonts? - if (c1 == 262) // hyphen - c1 = 45; - else if (c1 == 266) // emdash - c1 = 208; - if (c1 >= 0) { - charName = isoLatin1Encoding.getCharName(c1); - if (charName) { - switch (charSet) { - case textOutASCII7: - c1 = ascii7Encoding.getCharCode(charName); - break; - case textOutLatin1: - // no translation - break; - case textOutLatin2: - c1 = isoLatin2Encoding.getCharCode(charName); - break; - case textOutLatin5: - c1 = isoLatin5Encoding.getCharCode(charName); - break; - } - } else { - c1 = -1; - } + base = line->base; + + } else { + + if (line->rot == 0 && line->blk->page->primaryRot == 0) { + + xMin = line->edge[start]; + xMax = line->edge[start + len]; + yMin = line->yMin; + yMax = line->yMax; + base = line->base; + + } else { + + blk = line->blk; + d0 = line->edge[start]; + d1 = line->edge[start + len]; + d2 = d3 = d4 = 0; // make gcc happy + + switch (line->rot) { + case 0: + d2 = line->yMin; + d3 = line->yMax; + d4 = line->base; + d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin); + d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin); + d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin); + d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin); + d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin); + break; + case 1: + d2 = line->xMax; + d3 = line->xMin; + d4 = line->base; + d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin); + d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin); + d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin); + d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin); + d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin); + break; + case 2: + d2 = line->yMax; + d3 = line->yMin; + d4 = line->base; + d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin); + d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin); + d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin); + d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin); + d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin); + break; + case 3: + d2 = line->xMin; + d3 = line->xMax; + d4 = line->base; + d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin); + d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin); + d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin); + d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin); + d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin); + break; } - } - switch (charSet) { - case textOutASCII7: - if (c1 >= 128) { - sub = ascii7Subst[c1 - 128]; - n = strlen(sub); - } + + switch (line->blk->page->primaryRot) { + case 0: + xMin = blk->xMin + d0 * (blk->xMax - blk->xMin); + xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); + yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); + yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); + base = blk->yMin + base * (blk->yMax - blk->yMin); break; - case textOutLatin1: - if (c1 >= 256) { - sub = isoLatin1Subst[c1 - 256]; - n = strlen(sub); - } + case 1: + xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); + xMax = blk->xMax - d2 * (blk->xMax - blk->xMin); + yMin = blk->yMin + d0 * (blk->yMax - blk->yMin); + yMax = blk->yMin + d1 * (blk->yMax - blk->yMin); + base = blk->xMax - d4 * (blk->xMax - blk->xMin); break; - case textOutLatin2: - if (c1 >= 256) { - sub = isoLatin2Subst[c1 - 256]; - n = strlen(sub); - } + case 2: + xMin = blk->xMax - d1 * (blk->xMax - blk->xMin); + xMax = blk->xMax - d0 * (blk->xMax - blk->xMin); + yMin = blk->yMax - d3 * (blk->yMax - blk->yMin); + yMax = blk->yMax - d2 * (blk->yMax - blk->yMin); + base = blk->yMax - d4 * (blk->yMax - blk->yMin); break; - case textOutLatin5: - if (c1 >= 256) { - sub = isoLatin5Subst[c1 - 256]; - n = strlen(sub); - } + case 3: + xMin = blk->xMin + d2 * (blk->xMax - blk->xMin); + xMax = blk->xMin + d3 * (blk->xMax - blk->xMin); + yMin = blk->yMax - d1 * (blk->yMax - blk->yMin); + yMax = blk->yMax - d0 * (blk->yMax - blk->yMin); + base = blk->xMin + d4 * (blk->xMax - blk->xMin); break; + } + } - } else { - c1 = -1; - } - if (sub) - text->append(sub); - else if (c1 >= ' ') - text->append((char)c1); - else - text->append(' '); - - // update position information - if (i+n > ((i+15) & ~15)) - xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double)); - if (i == 0) - xMin = x; - for (j = 0; j < n; ++j) - xRight[i+j] = x + ((j+1) * dx) / n; - xMax = x + dx; -} - -void TextString::addChar16(GfxState *state, double x, double y, - double dx, double dy, - int c, GfxFontCharSet16 charSet) { - int c1, t1, t2; - int sub[8]; - char *p; - int *q; - int i, j, n; + } +} - // get current index - i = text->getLength(); - - // convert the 16-bit character - c1 = 0; - sub[0] = 0; - switch (charSet) { - - // convert Adobe-Japan1-2 to JIS X 0208-1983 - case font16AdobeJapan12: -#if JAPANESE_SUPPORT - if (c <= 96) { - c1 = 0x8080 + japan12Map[c]; - } else if (c <= 632) { - if (c <= 230) - c1 = 0; - else if (c <= 324) - c1 = 0x8080 + japan12Map[c - 230]; - else if (c <= 421) - c1 = 0x8080 + japan12KanaMap1[c - 325]; - else if (c <= 500) - c1 = 0; - else if (c <= 598) - c1 = 0x8080 + japan12KanaMap2[c - 501]; - else - c1 = 0; - } else if (c <= 1124) { - if (c <= 779) { - if (c <= 726) - c1 = 0xa1a1 + (c - 633); - else if (c <= 740) - c1 = 0xa2a1 + (c - 727); - else if (c <= 748) - c1 = 0xa2ba + (c - 741); - else if (c <= 755) - c1 = 0xa2ca + (c - 749); - else if (c <= 770) - c1 = 0xa2dc + (c - 756); - else if (c <= 778) - c1 = 0xa2f2 + (c - 771); - else - c1 = 0xa2fe; - } else if (c <= 841) { - if (c <= 789) - c1 = 0xa3b0 + (c - 780); - else if (c <= 815) - c1 = 0xa3c1 + (c - 790); - else - c1 = 0xa3e1 + (c - 816); - } else if (c <= 1010) { - if (c <= 924) - c1 = 0xa4a1 + (c - 842); - else - c1 = 0xa5a1 + (c - 925); - } else { - if (c <= 1034) - c1 = 0xa6a1 + (c - 1011); - else if (c <= 1058) - c1 = 0xa6c1 + (c - 1035); - else if (c <= 1091) - c1 = 0xa7a1 + (c - 1059); - else - c1 = 0xa7d1 + (c - 1092); - } - } else if (c <= 4089) { - t1 = (c - 1125) / 94; - t2 = (c - 1125) % 94; - c1 = 0xb0a1 + (t1 << 8) + t2; - } else if (c <= 7477) { - t1 = (c - 4090) / 94; - t2 = (c - 4090) % 94; - c1 = 0xd0a1 + (t1 << 8) + t2; - } else if (c <= 7554) { - c1 = 0; - } else if (c <= 7563) { // circled Arabic numbers 1..9 - c1 = 0xa3b1 + (c - 7555); - } else if (c <= 7574) { // circled Arabic numbers 10..20 - t1 = c - 7564 + 10; - sub[0] = 0xa3b0 + (t1 / 10); - sub[1] = 0xa3b0 + (t1 % 10); - sub[2] = 0; - c1 = -1; - } else if (c <= 7584) { // Roman numbers I..X - for (p = japan12Roman[c - 7575], q = sub; *p; ++p, ++q) { - *q = 0xa380 + *p; - } - *q = 0; - c1 = -1; - } else if (c <= 7632) { - if (c <= 7600) { - c1 = 0; - } else if (c <= 7606) { - for (p = japan12Abbrev1[c - 7601], q = sub; *p; ++p, ++q) { - *q = 0xa380 + *p; - } - *q = 0; - c1 = -1; - } else { - c1 = 0; - } - } else { - c1 = 0; +int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->blk->page->primaryRot) { + case 0: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag1->xMin - frag2->xMin; + } + break; + case 1: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag1->yMin - frag2->yMin; } -#if 0 //~ - if (c1 == 0) { - error(-1, "Unsupported Adobe-Japan1-2 character: %d", c); + break; + case 2: + if ((cmp = frag2->yMin - frag1->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; } -#endif -#endif // JAPANESE_SUPPORT break; - - case font16AdobeGB12: -#if CHINESE_GB_SUPPORT -#endif + case 3: + if ((cmp = frag1->xMax - frag2->xMax) == 0) { + cmp = frag2->yMax - frag1->yMax; + } break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} - case font16AdobeCNS13: -#if CHINESE_CNS_SUPPORT - if (c <= 98) { - c1 = cns13Map1[c]; - } else if (c <= 502) { - if (c == 247) { - c1 = 0xa1f7; - } else if (c == 248) { - c1 = 0xa1f6; - } else { - t1 = (c - 99) / 157; - t2 = (c - 99) % 157; - if (t2 <= 62) { - c1 = 0xa140 + (t1 << 8) + t2; - } else { - c1 = 0xa162 + (t1 << 8) + t2; +int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->rot) { + case 0: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag1->xMin - frag2->xMin; + } + break; + case 1: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag1->yMin - frag2->yMin; + } + break; + case 2: + if ((cmp = frag2->yMin - frag1->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; + } + break; + case 3: + if ((cmp = frag1->xMax - frag2->xMax) == 0) { + cmp = frag2->yMax - frag1->yMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) { + TextLineFrag *frag1 = (TextLineFrag *)p1; + TextLineFrag *frag2 = (TextLineFrag *)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (frag1->line->rot) { + case 0: + if ((cmp = frag1->xMin - frag2->xMin) == 0) { + cmp = frag1->yMin - frag2->yMin; + } + break; + case 1: + if ((cmp = frag1->yMin - frag2->yMin) == 0) { + cmp = frag2->xMax - frag1->xMax; + } + break; + case 2: + if ((cmp = frag2->xMax - frag1->xMax) == 0) { + cmp = frag2->yMin - frag1->yMin; + } + break; + case 3: + if ((cmp = frag2->yMax - frag1->yMax) == 0) { + cmp = frag1->xMax - frag2->xMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +TextBlock::TextBlock(TextPage *pageA, int rotA) { + page = pageA; + rot = rotA; + xMin = yMin = 0; + xMax = yMax = -1; + priMin = 0; + priMax = page->pageWidth; + pool = new TextPool(); + lines = NULL; + curLine = NULL; + next = NULL; + stackNext = NULL; +} + +TextBlock::~TextBlock() { + TextLine *line; + + delete pool; + while (lines) { + line = lines; + lines = lines->next; + delete line; + } +} + +void TextBlock::addWord(TextWord *word) { + pool->addWord(word); + if (xMin > xMax) { + xMin = word->xMin; + xMax = word->xMax; + yMin = word->yMin; + yMax = word->yMax; + } else { + if (word->xMin < xMin) { + xMin = word->xMin; + } + if (word->xMax > xMax) { + xMax = word->xMax; + } + if (word->yMin < yMin) { + yMin = word->yMin; + } + if (word->yMax > yMax) { + yMax = word->yMax; + } + } +} + +void TextBlock::coalesce(UnicodeMap *uMap) { + TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord; + TextLine *line, *line0, *line1; + int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; + int baseIdx, bestWordBaseIdx, idx0, idx1; + double minBase, maxBase; + double fontSize, delta, priDelta, secDelta; + TextLine **lineArray; + GBool found; + int col1, col2; + int i, j, k; + + // discard duplicated text (fake boldface, drop shadows) + for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) { + word0 = pool->getPool(idx0); + while (word0) { + priDelta = dupMaxPriDelta * word0->fontSize; + secDelta = dupMaxSecDelta * word0->fontSize; + if (rot == 0 || rot == 3) { + maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); + } else { + maxBaseIdx = pool->getBaseIdx(word0->base - secDelta); + } + found = gFalse; + word1 = word2 = NULL; // make gcc happy + for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) { + if (idx1 == idx0) { + word1 = word0; + word2 = word0->next; + } else { + word1 = NULL; + word2 = pool->getPool(idx1); + } + for (; word2; word1 = word2, word2 = word2->next) { + if (word2->len == word0->len && + !memcmp(word2->text, word0->text, + word0->len * sizeof(Unicode))) { + switch (rot) { + case 0: + case 2: + found = fabs(word0->xMin - word2->xMin) < priDelta && + fabs(word0->xMax - word2->xMax) < priDelta && + fabs(word0->yMin - word2->yMin) < secDelta && + fabs(word0->yMax - word2->yMax) < secDelta; + break; + case 1: + case 3: + found = fabs(word0->xMin - word2->xMin) < secDelta && + fabs(word0->xMax - word2->xMax) < secDelta && + fabs(word0->yMin - word2->yMin) < priDelta && + fabs(word0->yMax - word2->yMax) < priDelta; + break; + } + } + if (found) { + break; + } + } + if (found) { + break; + } + } + if (found) { + if (word1) { + word1->next = word2->next; + } else { + pool->setPool(idx1, word2->next); + } + delete word2; + } else { + word0 = word0->next; + } + } + } + + // build the lines + curLine = NULL; + poolMinBaseIdx = pool->minBaseIdx; + charCount = 0; + nLines = 0; + while (1) { + + // find the first non-empty line in the pool + for (; + poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); + ++poolMinBaseIdx) ; + if (poolMinBaseIdx > pool->maxBaseIdx) { + break; + } + + // look for the left-most word in the first four lines of the + // pool -- this avoids starting with a superscript word + startBaseIdx = poolMinBaseIdx; + for (baseIdx = poolMinBaseIdx + 1; + baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; + ++baseIdx) { + if (!pool->getPool(baseIdx)) { + continue; + } + if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) + < 0) { + startBaseIdx = baseIdx; + } + } + + // create a new line + word0 = pool->getPool(startBaseIdx); + pool->setPool(startBaseIdx, word0->next); + word0->next = NULL; + line = new TextLine(this, word0->rot, word0->base); + line->addWord(word0); + lastWord = word0; + + // compute the search range + fontSize = word0->fontSize; + minBase = word0->base - maxIntraLineDelta * fontSize; + maxBase = word0->base + maxIntraLineDelta * fontSize; + minBaseIdx = pool->getBaseIdx(minBase); + maxBaseIdx = pool->getBaseIdx(maxBase); + + // find the rest of the words in this line + while (1) { + + // find the left-most word whose baseline is in the range for + // this line + bestWordBaseIdx = 0; + bestWord0 = bestWord1 = NULL; + for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { + for (word0 = NULL, word1 = pool->getPool(baseIdx); + word1; + word0 = word1, word1 = word1->next) { + if (word1->base >= minBase && + word1->base <= maxBase && + (delta = lastWord->primaryDelta(word1)) >= + minCharSpacing * fontSize) { + if (delta < maxWordSpacing * fontSize && + (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) { + bestWordBaseIdx = baseIdx; + bestWord0 = word0; + bestWord1 = word1; + } + break; + } + } + } + if (!bestWord1) { + break; + } + + // remove it from the pool, and add it to the line + if (bestWord0) { + bestWord0->next = bestWord1->next; + } else { + pool->setPool(bestWordBaseIdx, bestWord1->next); + } + bestWord1->next = NULL; + line->addWord(bestWord1); + lastWord = bestWord1; + } + + // add the line + if (curLine && line->cmpYX(curLine) > 0) { + line0 = curLine; + line1 = curLine->next; + } else { + line0 = NULL; + line1 = lines; + } + for (; + line1 && line->cmpYX(line1) > 0; + line0 = line1, line1 = line1->next) ; + if (line0) { + line0->next = line; + } else { + lines = line; + } + line->next = line1; + curLine = line; + line->coalesce(uMap); + charCount += line->len; + ++nLines; + } + + // sort lines into xy order for column assignment + lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *)); + for (line = lines, i = 0; line; line = line->next, ++i) { + lineArray[i] = line; + } + qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY); + + // column assignment + nColumns = 0; + for (i = 0; i < nLines; ++i) { + line0 = lineArray[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + line1 = lineArray[j]; + if (line1->primaryDelta(line0) >= 0) { + col2 = line1->col[line1->len] + 1; + } else { + k = 0; // make gcc happy + switch (rot) { + case 0: + for (k = 0; + k < line1->len && + line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 1: + for (k = 0; + k < line1->len && + line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 2: + for (k = 0; + k < line1->len && + line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + case 3: + for (k = 0; + k < line1->len && + line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); + ++k) ; + break; + } + col2 = line1->col[k]; + } + if (col2 > col1) { + col1 = col2; + } + } + for (k = 0; k <= line0->len; ++k) { + line0->col[k] += col1; + } + if (line0->col[line0->len] > nColumns) { + nColumns = line0->col[line0->len]; + } + } + gfree(lineArray); +} + +void TextBlock::updatePriMinMax(TextBlock *blk) { + double newPriMin, newPriMax; + GBool gotPriMin, gotPriMax; + + gotPriMin = gotPriMax = gFalse; + newPriMin = newPriMax = 0; // make gcc happy + switch (page->primaryRot) { + case 0: + case 2: + if (blk->yMin < yMax && blk->yMax > yMin) { + if (blk->xMin < xMin) { + newPriMin = blk->xMax; + gotPriMin = gTrue; + } + if (blk->xMax > xMax) { + newPriMax = blk->xMin; + gotPriMax = gTrue; + } + } + break; + case 1: + case 3: + if (blk->xMin < xMax && blk->xMax > xMin) { + if (blk->yMin < yMin) { + newPriMin = blk->yMax; + gotPriMin = gTrue; + } + if (blk->yMax > yMax) { + newPriMax = blk->yMin; + gotPriMax = gTrue; + } + } + break; + } + if (gotPriMin) { + if (newPriMin > xMin) { + newPriMin = xMin; + } + if (newPriMin > priMin) { + priMin = newPriMin; + } + } + if (gotPriMax) { + if (newPriMax < xMax) { + newPriMax = xMax; + } + if (newPriMax < priMax) { + priMax = newPriMax; + } + } +} + +int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) { + TextBlock *blk1 = *(TextBlock **)p1; + TextBlock *blk2 = *(TextBlock **)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (blk1->page->primaryRot) { + case 0: + if ((cmp = blk1->xMin - blk2->xMin) == 0) { + cmp = blk1->yMin - blk2->yMin; + } + break; + case 1: + if ((cmp = blk1->yMin - blk2->yMin) == 0) { + cmp = blk2->xMax - blk1->xMax; + } + break; + case 2: + if ((cmp = blk2->xMax - blk1->xMax) == 0) { + cmp = blk2->yMin - blk1->yMin; + } + break; + case 3: + if ((cmp = blk2->yMax - blk1->yMax) == 0) { + cmp = blk1->xMax - blk2->xMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) { + TextBlock *blk1 = *(TextBlock **)p1; + TextBlock *blk2 = *(TextBlock **)p2; + double cmp; + + cmp = 0; // make gcc happy + switch (blk1->page->primaryRot) { + case 0: + if ((cmp = blk1->yMin - blk2->yMin) == 0) { + cmp = blk1->xMin - blk2->xMin; + } + break; + case 1: + if ((cmp = blk2->xMax - blk1->xMax) == 0) { + cmp = blk1->yMin - blk2->yMin; + } + break; + case 2: + if ((cmp = blk2->yMin - blk1->yMin) == 0) { + cmp = blk2->xMax - blk1->xMax; + } + break; + case 3: + if ((cmp = blk1->xMax - blk2->xMax) == 0) { + cmp = blk2->yMax - blk1->yMax; + } + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +int TextBlock::primaryCmp(TextBlock *blk) { + double cmp; + + cmp = 0; // make gcc happy + switch (rot) { + case 0: + cmp = xMin - blk->xMin; + break; + case 1: + cmp = yMin - blk->yMin; + break; + case 2: + cmp = blk->xMax - xMax; + break; + case 3: + cmp = blk->yMax - yMax; + break; + } + return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; +} + +double TextBlock::secondaryDelta(TextBlock *blk) { + double delta; + + delta = 0; // make gcc happy + switch (rot) { + case 0: + delta = blk->yMin - yMax; + break; + case 1: + delta = xMin - blk->xMax; + break; + case 2: + delta = yMin - blk->yMax; + break; + case 3: + delta = blk->xMin - xMax; + break; + } + return delta; +} + +GBool TextBlock::isBelow(TextBlock *blk) { + GBool below; + + below = gFalse; // make gcc happy + switch (page->primaryRot) { + case 0: + below = xMin >= blk->priMin && xMax <= blk->priMax && + yMin > blk->yMin; + break; + case 1: + below = yMin >= blk->priMin && yMax <= blk->priMax && + xMax < blk->xMax; + break; + case 2: + below = xMin >= blk->priMin && xMax <= blk->priMax && + yMax < blk->yMax; + break; + case 3: + below = yMin >= blk->priMin && yMax <= blk->priMax && + xMin > blk->xMin; + break; + } + + return below; +} + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) { + page = pageA; + xMin = blk->xMin; + xMax = blk->xMax; + yMin = blk->yMin; + yMax = blk->yMax; + priMin = blk->priMin; + priMax = blk->priMax; + blocks = lastBlk = blk; + next = NULL; +} + +TextFlow::~TextFlow() { + TextBlock *blk; + + while (blocks) { + blk = blocks; + blocks = blocks->next; + delete blk; + } +} + +void TextFlow::addBlock(TextBlock *blk) { + if (lastBlk) { + lastBlk->next = blk; + } else { + blocks = blk; + } + lastBlk = blk; + if (blk->xMin < xMin) { + xMin = blk->xMin; + } + if (blk->xMax > xMax) { + xMax = blk->xMax; + } + if (blk->yMin < yMin) { + yMin = blk->yMin; + } + if (blk->yMax > yMax) { + yMax = blk->yMax; + } +} + +GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) { + GBool fits; + + // lower blocks must use smaller fonts + if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) { + return gFalse; + } + + fits = gFalse; // make gcc happy + switch (page->primaryRot) { + case 0: + fits = blk->xMin >= priMin && blk->xMax <= priMax; + break; + case 1: + fits = blk->yMin >= priMin && blk->yMax <= priMax; + break; + case 2: + fits = blk->xMin >= priMin && blk->xMax <= priMax; + break; + case 3: + fits = blk->yMin >= priMin && blk->yMax <= priMax; + break; + } + return fits; +} + +#if TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextWordList +//------------------------------------------------------------------------ + +TextWordList::TextWordList(TextPage *text, GBool physLayout) { + TextFlow *flow; + TextBlock *blk; + TextLine *line; + TextWord *word; + TextWord **wordArray; + int nWords, i; + + words = new GList(); + + if (text->rawOrder) { + for (word = text->rawWords; word; word = word->next) { + words->append(word); + } + + } else if (physLayout) { + // this is inefficient, but it's also the least useful of these + // three cases + nWords = 0; + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + ++nWords; + } + } + } + } + wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *)); + i = 0; + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + wordArray[i++] = word; + } + } + } + } + qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX); + for (i = 0; i < nWords; ++i) { + words->append(wordArray[i]); + } + gfree(wordArray); + + } else { + for (flow = text->flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + words->append(word); + } + } + } + } + } +} + +TextWordList::~TextWordList() { + delete words; +} + +int TextWordList::getLength() { + return words->getLength(); +} + +TextWord *TextWordList::get(int idx) { + if (idx < 0 || idx >= words->getLength()) { + return NULL; + } + return (TextWord *)words->get(idx); +} + +#endif // TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextPage +//------------------------------------------------------------------------ + +TextPage::TextPage(GBool rawOrderA) { + int rot; + + rawOrder = rawOrderA; + curWord = NULL; + charPos = 0; + curFont = NULL; + curFontSize = 0; + nest = 0; + nTinyChars = 0; + lastCharOverlap = gFalse; + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + pools[rot] = new TextPool(); + } + } + flows = NULL; + blocks = NULL; + rawWords = NULL; + rawLastWord = NULL; + fonts = new GList(); + lastFindXMin = lastFindYMin = 0; + haveLastFind = gFalse; +} + +TextPage::~TextPage() { + int rot; + + clear(); + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + delete pools[rot]; + } + } + delete fonts; +} + +void TextPage::startPage(GfxState *state) { + clear(); + if (state) { + pageWidth = state->getPageWidth(); + pageHeight = state->getPageHeight(); + } else { + pageWidth = pageHeight = 0; + } +} + +void TextPage::endPage() { + if (curWord) { + endWord(); + } +} + +void TextPage::clear() { + int rot; + TextFlow *flow; + TextWord *word; + + if (curWord) { + delete curWord; + curWord = NULL; + } + if (rawOrder) { + while (rawWords) { + word = rawWords; + rawWords = rawWords->next; + delete word; + } + } else { + for (rot = 0; rot < 4; ++rot) { + delete pools[rot]; + } + while (flows) { + flow = flows; + flows = flows->next; + delete flow; + } + gfree(blocks); + } + deleteGList(fonts, TextFontInfo); + + curWord = NULL; + charPos = 0; + curFont = NULL; + curFontSize = 0; + nest = 0; + nTinyChars = 0; + if (!rawOrder) { + for (rot = 0; rot < 4; ++rot) { + pools[rot] = new TextPool(); + } + } + flows = NULL; + blocks = NULL; + rawWords = NULL; + rawLastWord = NULL; + fonts = new GList(); +} + +void TextPage::updateFont(GfxState *state) { + GfxFont *gfxFont; + double *fm; + char *name; + int code, mCode, letterCode, anyCode; + double w; + int i; + + // get the font info object + curFont = NULL; + for (i = 0; i < fonts->getLength(); ++i) { + curFont = (TextFontInfo *)fonts->get(i); + if (curFont->matches(state)) { + break; + } + curFont = NULL; + } + if (!curFont) { + curFont = new TextFontInfo(state); + fonts->append(curFont); + } + + // adjust the font size + gfxFont = state->getFont(); + curFontSize = state->getTransformedFontSize(); + if (gfxFont && gfxFont->getType() == fontType3) { + // This is a hack which makes it possible to deal with some Type 3 + // fonts. The problem is that it's impossible to know what the + // base coordinate system used in the font is without actually + // rendering the font. This code tries to guess by looking at the + // width of the character 'm' (which breaks if the font is a + // subset that doesn't contain 'm'). + mCode = letterCode = anyCode = -1; + for (code = 0; code < 256; ++code) { + name = ((Gfx8BitFont *)gfxFont)->getCharName(code); + if (name && name[0] == 'm' && name[1] == '\0') { + mCode = code; + } + if (letterCode < 0 && name && name[1] == '\0' && + ((name[0] >= 'A' && name[0] <= 'Z') || + (name[0] >= 'a' && name[0] <= 'z'))) { + letterCode = code; + } + if (anyCode < 0 && name && + ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { + anyCode = code; + } + } + if (mCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { + // 0.6 is a generic average 'm' width -- yes, this is a hack + curFontSize *= w / 0.6; + } else if (letterCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { + // even more of a hack: 0.5 is a generic letter width + curFontSize *= w / 0.5; + } else if (anyCode >= 0 && + (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { + // better than nothing: 0.5 is a generic character width + curFontSize *= w / 0.5; + } + fm = gfxFont->getFontMatrix(); + if (fm[0] != 0) { + curFontSize *= fabs(fm[3] / fm[0]); + } + } +} + +void TextPage::beginWord(GfxState *state, double x0, double y0) { + double *txtm, *ctm, *fontm; + double m[4], m2[4]; + int rot; + + // This check is needed because Type 3 characters can contain + // text-drawing operations (when TextPage is being used via + // {X,Win}SplashOutputDev rather than TextOutputDev). + if (curWord) { + ++nest; + return; + } + + // compute the rotation + txtm = state->getTextMat(); + ctm = state->getCTM(); + m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2]; + m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3]; + m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2]; + m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3]; + if (state->getFont()->getType() == fontType3) { + fontm = state->getFont()->getFontMatrix(); + m2[0] = fontm[0] * m[0] + fontm[1] * m[2]; + m2[1] = fontm[0] * m[1] + fontm[1] * m[3]; + m2[2] = fontm[2] * m[0] + fontm[3] * m[2]; + m2[3] = fontm[2] * m[1] + fontm[3] * m[3]; + m[0] = m2[0]; + m[1] = m2[1]; + m[2] = m2[2]; + m[3] = m2[3]; + } + if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) { + rot = (m[3] < 0) ? 0 : 2; + } else { + rot = (m[2] > 0) ? 1 : 3; + } + + curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize); +} + +void TextPage::addChar(GfxState *state, double x, double y, + double dx, double dy, + CharCode c, Unicode *u, int uLen) { + double x1, y1, w1, h1, dx2, dy2, base, sp; + int i; + + // if the previous char was a space, addChar will have called + // endWord, so we need to start a new word + if (!curWord) { + beginWord(state, x, y); + } + + // throw away chars that aren't inside the page bounds + state->transform(x, y, &x1, &y1); + if (x1 < 0 || x1 > pageWidth || + y1 < 0 || y1 > pageHeight) { + return; + } + + // subtract char and word spacing from the dx,dy values + sp = state->getCharSpace(); + if (c == (CharCode)0x20) { + sp += state->getWordSpace(); + } + state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); + dx -= dx2; + dy -= dy2; + state->transformDelta(dx, dy, &w1, &h1); + + // check the tiny chars limit + if (!globalParams->getTextKeepTinyChars() && + fabs(w1) < 3 && fabs(h1) < 3) { + if (++nTinyChars > 50000) { + return; + } + } + + // break words at space character + if (uLen == 1 && u[0] == (Unicode)0x20) { + ++curWord->charLen; + ++charPos; + endWord(); + return; + } + + // start a new word if: + // (1) this character's baseline doesn't match the current word's + // baseline, or + // (2) there is space between the end of the current word and this + // character, or + // (3) this character overlaps the previous one (duplicated text), or + // (4) the previous character was an overlap (we want each duplicated + // characters to be in a word by itself) + base = sp = 0; // make gcc happy + if (curWord->len > 0) { + switch (curWord->rot) { + case 0: + base = y1; + sp = x1 - curWord->xMax; + break; + case 1: + base = x1; + sp = y1 - curWord->yMax; + break; + case 2: + base = y1; + sp = curWord->xMin - x1; + break; + case 3: + base = x1; + sp = curWord->yMin - y1; + break; + } + if (fabs(base - curWord->base) > 0.5 || + sp > minWordBreakSpace * curWord->fontSize || + sp < -minDupBreakOverlap * curWord->fontSize || + lastCharOverlap) { + lastCharOverlap = gTrue; + endWord(); + beginWord(state, x, y); + } else { + lastCharOverlap = gFalse; + } + } else { + lastCharOverlap = gFalse; + } + + // page rotation and/or transform matrices can cause text to be + // drawn in reverse order -- in this case, swap the begin/end + // coordinates and break text into individual chars + if ((curWord->rot == 0 && w1 < 0) || + (curWord->rot == 1 && h1 < 0) || + (curWord->rot == 2 && w1 > 0) || + (curWord->rot == 3 && h1 > 0)) { + endWord(); + beginWord(state, x + dx, y + dy); + x1 += w1; + y1 += h1; + w1 = -w1; + h1 = -h1; + } + + // add the characters to the current word + if (uLen != 0) { + w1 /= uLen; + h1 /= uLen; + } + for (i = 0; i < uLen; ++i) { + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); + } + ++curWord->charLen; + ++charPos; +} + +void TextPage::endWord() { + // This check is needed because Type 3 characters can contain + // text-drawing operations (when TextPage is being used via + // {X,Win}SplashOutputDev rather than TextOutputDev). + if (nest > 0) { + --nest; + return; + } + + if (curWord) { + addWord(curWord); + curWord = NULL; + } +} + +void TextPage::addWord(TextWord *word) { + // throw away zero-length words -- they don't have valid xMin/xMax + // values, and they're useless anyway + if (word->len == 0) { + delete word; + return; + } + + if (rawOrder) { + if (rawLastWord) { + rawLastWord->next = word; + } else { + rawWords = word; + } + rawLastWord = word; + } else { + pools[word->rot]->addWord(word); + } +} + +void TextPage::coalesce(GBool physLayout) { + UnicodeMap *uMap; + TextPool *pool; + TextWord *word0, *word1, *word2; + TextLine *line; + TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1; + TextBlock **blkArray; + TextFlow *flow, *lastFlow; + int rot, poolMinBaseIdx, baseIdx, startBaseIdx; + double minBase, maxBase, newMinBase, newMaxBase; + double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace; + GBool found; + int count[4]; + int lrCount; + int firstBlkIdx, nBlocksLeft; + int col1, col2; + int i, j, n; + + if (rawOrder) { + primaryRot = 0; + primaryLR = gTrue; + return; + } + + uMap = globalParams->getTextEncoding(); + blkList = NULL; + lastBlk = NULL; + nBlocks = 0; + primaryRot = -1; + +#if 0 // for debugging + printf("*** initial words ***\n"); + for (rot = 0; rot < 4; ++rot) { + pool = pools[rot]; + for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) { + for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); +#endif + + //----- assemble the blocks + + //~ add an outer loop for writing mode (vertical text) + + // build blocks for each rotation value + for (rot = 0; rot < 4; ++rot) { + pool = pools[rot]; + poolMinBaseIdx = pool->minBaseIdx; + count[rot] = 0; + + // add blocks until no more words are left + while (1) { + + // find the first non-empty line in the pool + for (; + poolMinBaseIdx <= pool->maxBaseIdx && + !pool->getPool(poolMinBaseIdx); + ++poolMinBaseIdx) ; + if (poolMinBaseIdx > pool->maxBaseIdx) { + break; + } + + // look for the left-most word in the first four lines of the + // pool -- this avoids starting with a superscript word + startBaseIdx = poolMinBaseIdx; + for (baseIdx = poolMinBaseIdx + 1; + baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; + ++baseIdx) { + if (!pool->getPool(baseIdx)) { + continue; + } + if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) + < 0) { + startBaseIdx = baseIdx; + } + } + + // create a new block + word0 = pool->getPool(startBaseIdx); + pool->setPool(startBaseIdx, word0->next); + word0->next = NULL; + blk = new TextBlock(this, rot); + blk->addWord(word0); + + fontSize = word0->fontSize; + minBase = maxBase = word0->base; + colSpace1 = minColSpacing1 * fontSize; + colSpace2 = minColSpacing2 * fontSize; + lineSpace = maxLineSpacingDelta * fontSize; + intraLineSpace = maxIntraLineDelta * fontSize; + + // add words to the block + do { + found = gFalse; + + // look for words on the line above the current top edge of + // the block + newMinBase = minBase; + for (baseIdx = pool->getBaseIdx(minBase); + baseIdx >= pool->getBaseIdx(minBase - lineSpace); + --baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base < minBase && + word1->base >= minBase - lineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) + : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta1 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + newMinBase = word2->base; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + minBase = newMinBase; + + // look for words on the line below the current bottom edge of + // the block + newMaxBase = maxBase; + for (baseIdx = pool->getBaseIdx(maxBase); + baseIdx <= pool->getBaseIdx(maxBase + lineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base > maxBase && + word1->base <= maxBase + lineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) + : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta1 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + newMaxBase = word2->base; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + maxBase = newMaxBase; + + // look for words that are on lines already in the block, and + // that overlap the block horizontally + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin < blk->xMax + colSpace1 && + word1->xMax > blk->xMin - colSpace1) + : (word1->yMin < blk->yMax + colSpace1 && + word1->yMax > blk->yMin - colSpace1)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta2 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + found = gTrue; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + + // only check for outlying words (the next two chunks of code) + // if we didn't find anything else + if (found) { + continue; + } + + // scan down the left side of the block, looking for words + // that are near (but not overlapping) the block; if there are + // three or fewer, add them to the block + n = 0; + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMax <= blk->xMin && + word1->xMax > blk->xMin - colSpace2) + : (word1->yMax <= blk->yMin && + word1->yMax > blk->yMin - colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + ++n; + break; + } + word1 = word1->next; + } + } + if (n > 0 && n <= 3) { + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMax <= blk->xMin && + word1->xMax > blk->xMin - colSpace2) + : (word1->yMax <= blk->yMin && + word1->yMax > blk->yMin - colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + if (word2->base < minBase) { + minBase = word2->base; + } else if (word2->base > maxBase) { + maxBase = word2->base; + } + found = gTrue; + break; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + } + + // scan down the right side of the block, looking for words + // that are near (but not overlapping) the block; if there are + // three or fewer, add them to the block + n = 0; + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin >= blk->xMax && + word1->xMin < blk->xMax + colSpace2) + : (word1->yMin >= blk->yMax && + word1->yMin < blk->yMax + colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + ++n; + break; + } + word1 = word1->next; + } + } + if (n > 0 && n <= 3) { + for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); + baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); + ++baseIdx) { + word0 = NULL; + word1 = pool->getPool(baseIdx); + while (word1) { + if (word1->base >= minBase - intraLineSpace && + word1->base <= maxBase + intraLineSpace && + ((rot == 0 || rot == 2) + ? (word1->xMin >= blk->xMax && + word1->xMin < blk->xMax + colSpace2) + : (word1->yMin >= blk->yMax && + word1->yMin < blk->yMax + colSpace2)) && + fabs(word1->fontSize - fontSize) < + maxBlockFontSizeDelta3 * fontSize) { + word2 = word1; + if (word0) { + word0->next = word1->next; + } else { + pool->setPool(baseIdx, word1->next); + } + word1 = word1->next; + word2->next = NULL; + blk->addWord(word2); + if (word2->base < minBase) { + minBase = word2->base; + } else if (word2->base > maxBase) { + maxBase = word2->base; + } + found = gTrue; + break; + } else { + word0 = word1; + word1 = word1->next; + } + } + } + } + + } while (found); + + //~ need to compute the primary writing mode (horiz/vert) in + //~ addition to primary rotation + + // coalesce the block, and add it to the list + blk->coalesce(uMap); + if (lastBlk) { + lastBlk->next = blk; + } else { + blkList = blk; + } + lastBlk = blk; + count[rot] += blk->charCount; + if (primaryRot < 0 || count[rot] > count[primaryRot]) { + primaryRot = rot; + } + ++nBlocks; + } + } + +#if 0 // for debugging + printf("*** rotation ***\n"); + for (rot = 0; rot < 4; ++rot) { + printf(" %d: %6d\n", rot, count[rot]); + } + printf(" primary rot = %d\n", primaryRot); + printf("\n"); +#endif + +#if 0 // for debugging + printf("*** blocks ***\n"); + for (blk = blkList; blk; blk = blk->next) { + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax); + for (line = blk->lines; line; line = line->next) { + printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n", + line->xMin, line->xMax, line->yMin, line->yMax, line->base); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); +#endif + + // determine the primary direction + lrCount = 0; + for (blk = blkList; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + for (word0 = line->words; word0; word0 = word0->next) { + for (i = 0; i < word0->len; ++i) { + if (unicodeTypeL(word0->text[i])) { + ++lrCount; + } else if (unicodeTypeR(word0->text[i])) { + --lrCount; + } + } + } + } + } + primaryLR = lrCount >= 0; + +#if 0 // for debugging + printf("*** direction ***\n"); + printf("lrCount = %d\n", lrCount); + printf("primaryLR = %d\n", primaryLR); +#endif + + //----- column assignment + + // sort blocks into xy order for column assignment + blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); + for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { + blocks[i] = blk; + } + qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); + + // column assignment + for (i = 0; i < nBlocks; ++i) { + blk0 = blocks[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + blk1 = blocks[j]; + col2 = 0; // make gcc happy + switch (primaryRot) { + case 0: + if (blk0->xMin > blk1->xMax) { + col2 = blk1->col + blk1->nColumns + 3; + } else { + col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / + (blk1->xMax - blk1->xMin)) * + blk1->nColumns); + } + break; + case 1: + if (blk0->yMin > blk1->yMax) { + col2 = blk1->col + blk1->nColumns + 3; + } else { + col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / + (blk1->yMax - blk1->yMin)) * + blk1->nColumns); + } + break; + case 2: + if (blk0->xMax < blk1->xMin) { + col2 = blk1->col + blk1->nColumns + 3; + } else { + col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / + (blk1->xMin - blk1->xMax)) * + blk1->nColumns); + } + break; + case 3: + if (blk0->yMax < blk1->yMin) { + col2 = blk1->col + blk1->nColumns + 3; + } else { + col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / + (blk1->yMin - blk1->yMax)) * + blk1->nColumns); + } + break; + } + if (col2 > col1) { + col1 = col2; + } + } + blk0->col = col1; + for (line = blk0->lines; line; line = line->next) { + for (j = 0; j <= line->len; ++j) { + line->col[j] += col1; + } + } + } + +#if 0 // for debugging + printf("*** blocks, after column assignment ***\n"); + for (blk = blkList; blk; blk = blk->next) { + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col, + blk->nColumns); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); +#endif + + //----- reading order sort + + // sort blocks into yx order (in preparation for reading order sort) + qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot); + + // compute space on left and right sides of each block + for (i = 0; i < nBlocks; ++i) { + blk0 = blocks[i]; + for (j = 0; j < nBlocks; ++j) { + blk1 = blocks[j]; + if (blk1 != blk0) { + blk0->updatePriMinMax(blk1); + } + } + } + +#if 0 // for debugging + printf("*** blocks, after yx sort ***\n"); + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, + blk->priMin, blk->priMax); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (j = 0; j < word0->len; ++j) { + fputc(word0->text[j] & 0xff, stdout); + } + printf("'\n"); + } + } + } + printf("\n"); +#endif + + // build the flows + //~ this needs to be adjusted for writing mode (vertical text) + //~ this also needs to account for right-to-left column ordering + blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); + memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *)); + flows = lastFlow = NULL; + firstBlkIdx = 0; + nBlocksLeft = nBlocks; + while (nBlocksLeft > 0) { + + // find the upper-left-most block + for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ; + i = firstBlkIdx; + blk = blkArray[i]; + for (j = firstBlkIdx + 1; j < nBlocks; ++j) { + blk1 = blkArray[j]; + if (blk1) { + if (blk && blk->secondaryDelta(blk1) > 0) { + break; + } + if (blk1->primaryCmp(blk) < 0) { + i = j; + blk = blk1; + } + } + } + blkArray[i] = NULL; + --nBlocksLeft; + blk->next = NULL; + + // create a new flow, starting with the upper-left-most block + flow = new TextFlow(this, blk); + if (lastFlow) { + lastFlow->next = flow; + } else { + flows = flow; + } + lastFlow = flow; + fontSize = blk->lines->words->fontSize; + + // push the upper-left-most block on the stack + blk->stackNext = NULL; + blkStack = blk; + + // find the other blocks in this flow + while (blkStack) { + + // find the upper-left-most block under (but within + // maxBlockSpacing of) the top block on the stack + blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize; + blk = NULL; + i = -1; + for (j = firstBlkIdx; j < nBlocks; ++j) { + blk1 = blkArray[j]; + if (blk1) { + if (blkStack->secondaryDelta(blk1) > blkSpace) { + break; + } + if (blk && blk->secondaryDelta(blk1) > 0) { + break; + } + if (blk1->isBelow(blkStack) && + (!blk || blk1->primaryCmp(blk) < 0)) { + i = j; + blk = blk1; + } } } - } else if (c <= 505) { - c1 = 0xa3bd + (c - 503); - } else if (c <= 594) { - c1 = 0; - } else if (c <= 5995) { - if (c == 2431) { - c1 = 0xacfe; - } else if (c == 4308) { - c1 = 0xbe52; - } else if (c == 5221) { - c1 = 0xc2cb; - } else if (c == 5495) { - c1 = 0xc456; - } else if (c == 5550) { - c1 = 0xc3ba; - } else if (c == 5551) { - c1 = 0xc3b9; + + // if a suitable block was found, add it to the flow and push it + // onto the stack + if (blk && flow->blockFits(blk, blkStack)) { + blkArray[i] = NULL; + --nBlocksLeft; + blk->next = NULL; + flow->addBlock(blk); + fontSize = blk->lines->words->fontSize; + blk->stackNext = blkStack; + blkStack = blk; + + // otherwise (if there is no block under the top block or the + // block is not suitable), pop the stack } else { - if (c >= 2007 && c <= 2430) { - t1 = c - 594; - } else if (c >= 4309 && c <= 4695) { - t1 = c - 596; - } else if (c >= 5222 && c <= 5410) { - t1 = c - 596; - } else if (c >= 5496 && c <= 5641) { - t1 = c - 596; - } else { - t1 = c - 595; - } - t2 = t1 % 157; - t1 /= 157; - if (t2 <= 62) { - c1 = 0xa440 + (t1 << 8) + t2; - } else { - c1 = 0xa462 + (t1 << 8) + t2; - } + blkStack = blkStack->stackNext; } - } else if (c <= 13645) { - if (c == 6039) { - c1 = 0xc9be; - } else if (c == 6134) { - c1 = 0xcaf7; - } else if (c == 8142) { - c1 = 0xdadf; - } else if (c == 8788) { - c1 = 0xd6cc; - } else if (c == 8889) { - c1 = 0xd77a; - } else if (c == 10926) { - c1 = 0xebf1; - } else if (c == 11073) { - c1 = 0xecde; - } else if (c == 11361) { - c1 = 0xf0cb; - } else if (c == 11719) { - c1 = 0xf056; - } else if (c == 12308) { - c1 = 0xeeeb; - } else if (c == 12526) { - c1 = 0xf4b5; - } else if (c == 12640) { - c1 = 0xf16b; - } else if (c == 12783) { - c1 = 0xf268; - } else if (c == 12900) { - c1 = 0xf663; - } else if (c == 13585) { - c1 = 0xf9c4; - } else if (c == 13641) { - c1 = 0xf9c6; - } else { - if (c >= 6006 && c <= 6038) { - t1 = c - 5995; - } else if (c >= 6088 && c <= 6133) { - t1 = c - 5995; - } else if (c >= 6302 && c <= 8250) { - t1 = c - 5995; - } else if (c >= 8251 && c <= 8888) { - t1 = c - 5994; - } else if (c >= 8890 && c <= 9288) { - t1 = c - 5995; - } else if (c >= 9289 && c <= 10925) { - t1 = c - 5994; - } else if (c >= 10927 && c <= 11072) { - t1 = c - 5995; - } else if (c >= 11362 && c <= 11477) { - t1 = c - 5997; - } else if (c >= 11615 && c <= 11718) { - t1 = c - 5995; - } else if (c >= 11942 && c <= 12139) { - t1 = c - 5995; - } else if (c >= 12140 && c <= 12221) { - t1 = c - 5994; - } else if (c >= 12222 && c <= 12307) { - t1 = c - 5993; - } else if (c >= 12309 && c <= 12316) { - t1 = c - 5994; - } else if (c >= 12317 && c <= 12469) { - t1 = c - 5993; - } else if (c >= 12470 && c <= 12525) { - t1 = c - 5992; - } else if (c >= 12527 && c <= 12639) { - t1 = c - 5993; - } else if (c >= 12641 && c <= 12782) { - t1 = c - 5994; - } else if (c >= 12784 && c <= 12828) { - t1 = c - 5995; - } else if (c >= 12829 && c <= 12899) { - t1 = c - 5994; - } else if (c >= 12901 && c <= 13094) { - t1 = c - 5995; - } else if (c >= 13095 && c <= 13584) { - t1 = c - 5994; - } else if (c >= 13586 && c <= 13628) { - t1 = c - 5995; - } else if (c == 13629) { - t1 = c - 5994; - } else if (c >= 13630 && c <= 13640) { - t1 = c - 5993; - } else if (c >= 13642 && c <= 13645) { - t1 = c - 5994; - } else { - t1 = c - 5996; - } - t2 = t1 % 157; - t1 /= 157; - if (t2 <= 62) { - c1 = 0xc940 + (t1 << 8) + t2; - } else { - c1 = 0xc962 + (t1 << 8) + t2; + } + } + gfree(blkArray); + +#if 0 // for debugging + printf("*** flows ***\n"); + for (flow = flows; flow; flow = flow->next) { + printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n", + flow->xMin, flow->xMax, flow->yMin, flow->yMax, + flow->priMin, flow->priMax); + for (blk = flow->blocks; blk; blk = blk->next) { + printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n", + blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, + blk->priMin, blk->priMax); + for (line = blk->lines; line; line = line->next) { + printf(" line:\n"); + for (word0 = line->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->base, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); } } - } else if (c == 13646) { - c1 = 0xa14b; - } else if (c == 13647) { - c1 = 0xa1e3; - } else if (c <= 13742) { - c1 = cns13Map2[c - 13648]; - } else if (c <= 13746) { - c1 = 0xa159 + (c - 13743); - } else if (c <= 14055) { - c1 = 0; - } else if (c <= 14062) { - c1 = 0xf9d6 + (c - 14056); - } -#if 1 //~ - if (c1 == 0) { - error(-1, "Unsupported Adobe-CNS1-3 character: %d", c); } -#endif -#endif - break; } + printf("\n"); +#endif - // append converted character to string - if (c1 == 0) { - text->append(' '); - n = 1; - } else if (c1 > 0) { - text->append(c1 >> 8); - text->append(c1 & 0xff); - n = 2; - } else { - n = 0; - for (q = sub; *q; ++q) { - text->append(*q >> 8); - text->append(*q & 0xff); - n += 2; - } + if (uMap) { + uMap->decRefCnt(); } +} + +GBool TextPage::findText(Unicode *s, int len, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, + double *xMin, double *yMin, + double *xMax, double *yMax) { + TextBlock *blk; + TextLine *line; + Unicode *p; + Unicode u1, u2; + int m, i, j, k; + double xStart, yStart, xStop, yStop; + double xMin0, yMin0, xMax0, yMax0; + double xMin1, yMin1, xMax1, yMax1; + GBool found; + + //~ needs to handle right-to-left text - // update position information - if (i+n > ((i+15) & ~15)) { - xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double)); + if (rawOrder) { + return gFalse; } - if (i == 0) { - xMin = x; + + xStart = yStart = xStop = yStop = 0; + if (startAtLast && haveLastFind) { + xStart = lastFindXMin; + yStart = lastFindYMin; + } else if (!startAtTop) { + xStart = *xMin; + yStart = *yMin; } - for (j = 0; j < n; ++j) { - xRight[i+j] = x + dx; + if (stopAtLast && haveLastFind) { + xStop = lastFindXMin; + yStop = lastFindYMin; + } else if (!stopAtBottom) { + xStop = *xMax; + yStop = *yMax; } - xMax = x + dx; -} -//------------------------------------------------------------------------ -// TextPage -//------------------------------------------------------------------------ + found = gFalse; + xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy + xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy -TextPage::TextPage(TextOutputCharSet charSet, GBool rawOrder) { - this->charSet = charSet; - this->rawOrder = rawOrder; - curStr = NULL; - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; - nest = 0; -} + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; -TextPage::~TextPage() { - clear(); -} + // check: is the block above the top limit? + if (!startAtTop && blk->yMax < yStart) { + continue; + } -void TextPage::beginString(GfxState *state, GString *s, GBool hexCodes) { - // This check is needed because Type 3 characters can contain - // text-drawing operations. - if (curStr) { - ++nest; - return; - } + // check: is the block below the bottom limit? + if (!stopAtBottom && blk->yMin > yStop) { + break; + } - curStr = new TextString(state, hexCodes); -} + for (line = blk->lines; line; line = line->next) { -void TextPage::addChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c) { - double x1, y1, w1, h1, dx2, dy2; - int n; - GBool hexCodes; + // check: is the line above the top limit? + if (!startAtTop && line->yMin < yStart) { + continue; + } - state->transform(x, y, &x1, &y1); - state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2); - dx -= dx2; - dy -= dy2; - state->transformDelta(dx, dy, &w1, &h1); - n = curStr->text->getLength(); - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - hexCodes = curStr->hexCodes; - endString(); - beginString(state, NULL, hexCodes); - } - curStr->addChar(state, x1, y1, w1, h1, c, charSet); -} + // check: is the line below the bottom limit? + if (!stopAtBottom && line->yMin > yStop) { + continue; + } -void TextPage::addChar16(GfxState *state, double x, double y, - double dx, double dy, int c, - GfxFontCharSet16 charSet) { - double x1, y1, w1, h1, dx2, dy2; - int n; - GBool hexCodes; + // search each position in this line + m = line->len; + for (j = 0, p = line->text; j <= m - len; ++j, ++p) { + + // compare the strings + for (k = 0; k < len; ++k) { +#if 1 //~ this lowercases Latin A-Z only -- this will eventually be + //~ extended to handle other character sets + if (p[k] >= 0x41 && p[k] <= 0x5a) { + u1 = p[k] + 0x20; + } else { + u1 = p[k]; + } + if (s[k] >= 0x41 && s[k] <= 0x5a) { + u2 = s[k] + 0x20; + } else { + u2 = s[k]; + } +#endif + if (u1 != u2) { + break; + } + } - state->transform(x, y, &x1, &y1); - state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2); - dx -= dx2; - dy -= dy2; - state->transformDelta(dx, dy, &w1, &h1); - n = curStr->text->getLength(); - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - hexCodes = curStr->hexCodes; - endString(); - beginString(state, NULL, hexCodes); + // found it + if (k == len) { + switch (line->rot) { + case 0: + xMin1 = line->edge[j]; + xMax1 = line->edge[j + len]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 1: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[j]; + yMax1 = line->edge[j + len]; + break; + case 2: + xMin1 = line->edge[j + len]; + xMax1 = line->edge[j]; + yMin1 = line->yMin; + yMax1 = line->yMax; + break; + case 3: + xMin1 = line->xMin; + xMax1 = line->xMax; + yMin1 = line->edge[j + len]; + yMax1 = line->edge[j]; + break; + } + if ((startAtTop || + yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && + (stopAtBottom || + yMin1 < yStop || (yMin1 == yStop && xMin1 < yStop))) { + if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { + xMin0 = xMin1; + xMax0 = xMax1; + yMin0 = yMin1; + yMax0 = yMax1; + found = gTrue; + } + } + } + } + } } - curStr->addChar16(state, x1, y1, w1, h1, c, charSet); + + if (found) { + *xMin = xMin0; + *xMax = xMax0; + *yMin = yMin0; + *yMax = yMax0; + lastFindXMin = xMin0; + lastFindYMin = yMin0; + haveLastFind = gTrue; + return gTrue; + } + + return gFalse; } -void TextPage::endString() { - TextString *p1, *p2; - double h, y1, y2; +GString *TextPage::getText(double xMin, double yMin, + double xMax, double yMax) { + GString *s; + UnicodeMap *uMap; + GBool isUnicode; + TextBlock *blk; + TextLine *line; + TextLineFrag *frags; + int nFrags, fragsSize; + TextLineFrag *frag; + char space[8], eol[16]; + int spaceLen, eolLen; + int lastRot; + double x, y; + int col, idx0, idx1, i, j; + GBool multiLine, oneRot; - if (nest > 0) { - --nest; - return; + s = new GString(); + + if (rawOrder) { + return s; } - // throw away zero-length strings -- they don't have valid xMin/xMax - // values, and they're useless anyway - if (curStr->text->getLength() == 0) { - delete curStr; - curStr = NULL; - return; + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return s; + } + isUnicode = uMap->isUnicode(); + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + break; } - // insert string in y-major list - h = curStr->yMax - curStr->yMin; - y1 = curStr->yMin + 0.5 * h; - y2 = curStr->yMin + 0.8 * h; - if (rawOrder) { - p1 = yxCur1; - p2 = NULL; - } else if ((!yxCur1 || - (y1 >= yxCur1->yMin && - (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) && - (!yxCur2 || - (y1 < yxCur2->yMin || - (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) { - p1 = yxCur1; - p2 = yxCur2; - } else { - for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) { - if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) - break; + //~ writing mode (horiz/vert) + + // collect the line fragments that are in the rectangle + fragsSize = 256; + frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag)); + nFrags = 0; + lastRot = -1; + oneRot = gTrue; + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + if (xMin < blk->xMax && blk->xMin < xMax && + yMin < blk->yMax && blk->yMin < yMax) { + for (line = blk->lines; line; line = line->next) { + if (xMin < line->xMax && line->xMin < xMax && + yMin < line->yMax && line->yMin < yMax) { + idx0 = idx1 = -1; + switch (line->rot) { + case 0: + y = 0.5 * (line->yMin + line->yMax); + if (yMin < y && y < yMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) { + idx1 = j; + break; + } + --j; + } + } + break; + case 1: + x = 0.5 * (line->xMin + line->xMax); + if (xMin < x && x < xMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) { + idx1 = j; + break; + } + --j; + } + } + break; + case 2: + y = 0.5 * (line->yMin + line->yMax); + if (yMin < y && y < yMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) { + idx1 = j; + break; + } + --j; + } + } + break; + case 3: + x = 0.5 * (line->xMin + line->xMax); + if (xMin < x && x < xMax) { + j = 0; + while (j < line->len) { + if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) { + idx0 = j; + break; + } + ++j; + } + j = line->len - 1; + while (j >= 0) { + if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) { + idx1 = j; + break; + } + --j; + } + } + break; + } + if (idx0 >= 0 && idx1 >= 0) { + if (nFrags == fragsSize) { + fragsSize *= 2; + frags = (TextLineFrag *) + grealloc(frags, fragsSize * sizeof(TextLineFrag)); + } + frags[nFrags].init(line, idx0, idx1 - idx0 + 1); + ++nFrags; + if (lastRot >= 0 && line->rot != lastRot) { + oneRot = gFalse; + } + lastRot = line->rot; + } + } + } } - yxCur2 = p2; } - yxCur1 = curStr; - if (p1) - p1->yxNext = curStr; - else - yxStrings = curStr; - curStr->yxNext = p2; - curStr = NULL; -} -void TextPage::coalesce() { - TextString *str1, *str2; - double space, d; - int n, i; + // sort the fragments and generate the string + if (nFrags > 0) { -#if 0 //~ for debugging - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - printf("x=%3d..%3d y=%3d..%3d size=%2d '%s'\n", - (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax, - (int)(str1->yMax - str1->yMin), str1->text->getCString()); - } - printf("\n------------------------------------------------------------\n\n"); -#endif - str1 = yxStrings; - while (str1 && (str2 = str1->yxNext)) { - space = str1->yMax - str1->yMin; - d = str2->xMin - str1->xMax; - if (((rawOrder && - ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || - (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || - (!rawOrder && str2->yMin < str1->yMax)) && - d > -0.5 * space && d < space) { - n = str1->text->getLength(); - if (d > 0.1 * space) - str1->text->append(' '); - str1->text->append(str2->text); - str1->xRight = (double *) - grealloc(str1->xRight, - ((str1->text->getLength() + 15) & ~15) * sizeof(double)); - if (d > 0.1 * space) - str1->xRight[n++] = str2->xMin; - for (i = 0; i < str2->text->getLength(); ++i) - str1->xRight[n++] = str2->xRight[i]; - if (str2->xMax > str1->xMax) - str1->xMax = str2->xMax; - if (str2->yMax > str1->yMax) - str1->yMax = str2->yMax; - str1->yxNext = str2->yxNext; - delete str2; + for (i = 0; i < nFrags; ++i) { + frags[i].computeCoords(oneRot); + } + assignColumns(frags, nFrags, oneRot); + + // if all lines in the region have the same rotation, use it; + // otherwise, use the page's primary rotation + if (oneRot) { + qsort(frags, nFrags, sizeof(TextLineFrag), + &TextLineFrag::cmpYXLineRot); } else { - str1 = str2; + qsort(frags, nFrags, sizeof(TextLineFrag), + &TextLineFrag::cmpYXPrimaryRot); } - } -} -GBool TextPage::findText(char *s, GBool top, GBool bottom, - double *xMin, double *yMin, - double *xMax, double *yMax) { - TextString *str; - char *p, *p1, *q; - int n, m, i; - double x; - - // scan all strings on page - n = strlen(s); - for (str = yxStrings; str; str = str->yxNext) { - - // check: above top limit? - if (!top && (str->yMax < *yMin || - (str->yMin < *yMin && str->xMax <= *xMin))) - continue; + col = 0; + multiLine = gFalse; + for (i = 0; i < nFrags; ++i) { + frag = &frags[i]; + + // insert a return + if (frag->col < col || + (i > 0 && fabs(frag->base - frags[i-1].base) > + maxIntraLineDelta * frags[i-1].line->words->fontSize)) { + s->append(eol, eolLen); + col = 0; + multiLine = gTrue; + } - // check: below bottom limit? - if (!bottom && (str->yMin > *yMax || - (str->yMax > *yMax && str->xMin >= *xMax))) - return gFalse; + // column alignment + for (; col < frag->col; ++col) { + s->append(space, spaceLen); + } - // search each position in this string - m = str->text->getLength(); - for (i = 0, p = str->text->getCString(); i <= m - n; ++i, ++p) { + // get the fragment text + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); + } - // check: above top limit? - if (!top && str->yMin < *yMin) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x < *xMin) - continue; - } + if (multiLine) { + s->append(eol, eolLen); + } + } - // check: below bottom limit? - if (!bottom && str->yMax > *yMax) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x > *xMax) - return gFalse; - } + gfree(frags); + uMap->decRefCnt(); - // compare the strings - for (p1 = p, q = s; *q; ++p1, ++q) { - if (tolower(*p1) != tolower(*q)) - break; - } + return s; +} + +GBool TextPage::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + TextBlock *blk; + TextLine *line; + TextWord *word; + double xMin0, xMax0, yMin0, yMax0; + double xMin1, xMax1, yMin1, yMax1; + GBool first; + int i, j0, j1; - // found it - if (!*q) { - *xMin = (i == 0) ? str->xMin : str->xRight[i-1]; - *xMax = str->xRight[i+n-1]; - *yMin = str->yMin; - *yMax = str->yMax; - return gTrue; + if (rawOrder) { + return gFalse; + } + + //~ this doesn't correctly handle: + //~ - ranges split across multiple lines (the highlighted region + //~ is the bounding box of all the parts of the range) + //~ - cases where characters don't convert one-to-one into Unicode + first = gTrue; + xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy + xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + for (line = blk->lines; line; line = line->next) { + for (word = line->words; word; word = word->next) { + if (pos < word->charPos + word->charLen && + word->charPos < pos + length) { + j0 = pos - word->charPos; + if (j0 < 0) { + j0 = 0; + } + j1 = pos + length - 1 - word->charPos; + if (j1 >= word->len) { + j1 = word->len - 1; + } + switch (line->rot) { + case 0: + xMin1 = word->edge[j0]; + xMax1 = word->edge[j1 + 1]; + yMin1 = word->yMin; + yMax1 = word->yMax; + break; + case 1: + xMin1 = word->xMin; + xMax1 = word->xMax; + yMin1 = word->edge[j0]; + yMax1 = word->edge[j1 + 1]; + break; + case 2: + xMin1 = word->edge[j1 + 1]; + xMax1 = word->edge[j0]; + yMin1 = word->yMin; + yMax1 = word->yMax; + break; + case 3: + xMin1 = word->xMin; + xMax1 = word->xMax; + yMin1 = word->edge[j1 + 1]; + yMax1 = word->edge[j0]; + break; + } + if (first || xMin1 < xMin0) { + xMin0 = xMin1; + } + if (first || xMax1 > xMax0) { + xMax0 = xMax1; + } + if (first || yMin1 < yMin0) { + yMin0 = yMin1; + } + if (first || yMax1 > yMax0) { + yMax0 = yMax1; + } + first = gFalse; + } } } } + if (!first) { + *xMin = xMin0; + *xMax = xMax0; + *yMin = yMin0; + *yMax = yMax0; + return gTrue; + } return gFalse; } -GString *TextPage::getText(double xMin, double yMin, - double xMax, double yMax) { +void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout) { + UnicodeMap *uMap; + TextFlow *flow; + TextBlock *blk; + TextLine *line; + TextLineFrag *frags; + TextWord *word; + int nFrags, fragsSize; + TextLineFrag *frag; + char space[8], eol[16], eop[8]; + int spaceLen, eolLen, eopLen; + GBool pageBreaks; GString *s; - TextString *str1; - double x0, x1, x2, y; - double xPrev, yPrev; - int i1, i2; - GBool multiLine; + int col, i, d, n; - s = new GString(); - xPrev = yPrev = 0; - multiLine = gFalse; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - y = 0.5 * (str1->yMin + str1->yMax); - if (y > yMax) - break; - if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) { - x0 = x1 = x2 = str1->xMin; - for (i1 = 0; i1 < str1->text->getLength(); ++i1) { - x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1]; - x1 = str1->xRight[i1]; - if (0.5 * (x0 + x1) >= xMin) - break; + // get the output encoding + if (!(uMap = globalParams->getTextEncoding())) { + return; + } + spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); + eolLen = 0; // make gcc happy + switch (globalParams->getTextEOL()) { + case eolUnix: + eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); + break; + case eolDOS: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); + break; + case eolMac: + eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); + break; + } + eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); + pageBreaks = globalParams->getTextPageBreaks(); + + //~ writing mode (horiz/vert) + + // output the page in raw (content stream) order + if (rawOrder) { + + for (word = rawWords; word; word = word->next) { + s = new GString(); + dumpFragment(word->text, word->len, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; + if (word->next && + fabs(word->next->base - word->base) < + maxIntraLineDelta * word->fontSize) { + if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) { + (*outputFunc)(outputStream, space, spaceLen); + } + } else { + (*outputFunc)(outputStream, eol, eolLen); } - for (i2 = str1->text->getLength() - 1; i2 > i1; --i2) { - x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1]; - x2 = str1->xRight[i2]; - if (0.5 * (x1 + x2) <= xMax) - break; + } + + // output the page, maintaining the original physical layout + } else if (physLayout) { + + // collect the line fragments for the page and sort them + fragsSize = 256; + frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag)); + nFrags = 0; + for (i = 0; i < nBlocks; ++i) { + blk = blocks[i]; + for (line = blk->lines; line; line = line->next) { + if (nFrags == fragsSize) { + fragsSize *= 2; + frags = (TextLineFrag *)grealloc(frags, + fragsSize * sizeof(TextLineFrag)); + } + frags[nFrags].init(line, 0, line->len); + frags[nFrags].computeCoords(gTrue); + ++nFrags; } - if (s->getLength() > 0) { - if (x0 < xPrev || str1->yMin > yPrev) { -#ifdef MACOS - s->append('\r'); -#else - s->append('\n'); -#endif - multiLine = gTrue; + } + qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot); + + // generate output + col = 0; + for (i = 0; i < nFrags; ++i) { + frag = &frags[i]; + + // column alignment + for (; col < frag->col; ++col) { + (*outputFunc)(outputStream, space, spaceLen); + } + + // print the line + s = new GString(); + col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; + + // print one or more returns if necessary + if (i == nFrags - 1 || + frags[i+1].col < col || + fabs(frags[i+1].base - frag->base) > + maxIntraLineDelta * frag->line->words->fontSize) { + if (i < nFrags - 1) { + d = (int)((frags[i+1].base - frag->base) / + frag->line->words->fontSize); + if (d < 1) { + d = 1; + } else if (d > 5) { + d = 5; + } } else { - s->append(" "); + d = 1; + } + for (; d > 0; --d) { + (*outputFunc)(outputStream, eol, eolLen); } + col = 0; } - s->append(str1->text->getCString() + i1, i2 - i1 + 1); - xPrev = x2; - yPrev = str1->yMax; } - } - if (multiLine) { -#ifdef MACOS - s->append('\r'); -#else - s->append('\n'); -#endif - } - return s; -} -void TextPage::dump(FILE *f) { - TextString *str1, *str2, *str3; - double yMin, yMax; - int col1, col2; - double d; - - // build x-major list - xyStrings = NULL; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - for (str2 = NULL, str3 = xyStrings; - str3; - str2 = str3, str3 = str3->xyNext) { - if (str1->xMin < str3->xMin || - (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) - break; - } - if (str2) - str2->xyNext = str1; - else - xyStrings = str1; - str1->xyNext = str3; - } + gfree(frags); - // do column assignment - for (str1 = xyStrings; str1; str1 = str1->xyNext) { - col1 = 0; - for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) { - if (str1->xMin >= str2->xMax) { - col2 = str2->col + str2->text->getLength() + 4; - if (col2 > col1) - col1 = col2; - } else if (str1->xMin > str2->xMin) { - col2 = str2->col + - (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) * - str2->text->getLength()); - if (col2 > col1) { - col1 = col2; + // output the page, "undoing" the layout + } else { + for (flow = flows; flow; flow = flow->next) { + for (blk = flow->blocks; blk; blk = blk->next) { + for (line = blk->lines; line; line = line->next) { + n = line->len; + if (line->hyphenated && (line->next || blk->next)) { + --n; + } + s = new GString(); + dumpFragment(line->text, n, uMap, s); + (*outputFunc)(outputStream, s->getCString(), s->getLength()); + delete s; + if (!line->hyphenated) { + if (line->next) { + (*outputFunc)(outputStream, space, spaceLen); + } else if (blk->next) { + //~ this is a bit of a kludge - we should really do a more + //~ intelligent determination of paragraphs + if (blk->next->lines->words->fontSize == + blk->lines->words->fontSize) { + (*outputFunc)(outputStream, space, spaceLen); + } else { + (*outputFunc)(outputStream, eol, eolLen); + } + } + } } } + (*outputFunc)(outputStream, eol, eolLen); + (*outputFunc)(outputStream, eol, eolLen); } - str1->col = col1; } -#if 0 //~ for debugging - fprintf(f, "~~~~~~~~~~\n"); - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - fprintf(f, "(%4d,%4d) - (%4d,%4d) [%3d] %s\n", - (int)str1->xMin, (int)str1->yMin, (int)str1->xMax, (int)str1->yMax, - str1->col, str1->text->getCString()); + // end of page + if (pageBreaks) { + (*outputFunc)(outputStream, eop, eopLen); + (*outputFunc)(outputStream, eol, eolLen); } - fprintf(f, "~~~~~~~~~~\n"); -#endif - // output - col1 = 0; - yMax = yxStrings ? yxStrings->yMax : 0; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { + uMap->decRefCnt(); +} - // line this string up with the correct column - if (rawOrder && col1 == 0) { - col1 = str1->col; - } else { - for (; col1 < str1->col; ++col1) { - fputc(' ', f); +void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { + TextLineFrag *frag0, *frag1; + int rot, col1, col2, i, j, k; + + // all text in the region has the same rotation -- recompute the + // column numbers based only on the text in the region + if (oneRot) { + qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot); + rot = frags[0].line->rot; + for (i = 0; i < nFrags; ++i) { + frag0 = &frags[i]; + col1 = 0; + for (j = 0; j < i; ++j) { + frag1 = &frags[j]; + col2 = 0; // make gcc happy + switch (rot) { + case 0: + if (frag0->xMin >= frag1->xMax) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->xMin >= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 1: + if (frag0->yMin >= frag1->yMax) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->yMin >= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 2: + if (frag0->xMax <= frag1->xMin) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->xMax <= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + case 3: + if (frag0->yMax <= frag1->yMin) { + col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - + frag1->line->col[frag1->start]) + 1; + } else { + for (k = frag1->start; + k < frag1->start + frag1->len && + frag0->yMax <= 0.5 * (frag1->line->edge[k] + + frag1->line->edge[k+1]); + ++k) ; + col2 = frag1->col + + frag1->line->col[k] - frag1->line->col[frag1->start]; + } + break; + } + if (col2 > col1) { + col1 = col2; + } } + frag0->col = col1; } - // print the string - fputs(str1->text->getCString(), f); + // the region includes text at different rotations -- use the + // globally assigned column numbers, offset by the minimum column + // number (i.e., shift everything over to column 0) + } else { + col1 = frags[0].col; + for (i = 1; i < nFrags; ++i) { + if (frags[i].col < col1) { + col1 = frags[i].col; + } + } + for (i = 0; i < nFrags; ++i) { + frags[i].col -= col1; + } + } +} - // increment column - col1 += str1->text->getLength(); +int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, + GString *s) { + char lre[8], rle[8], popdf[8], buf[8]; + int lreLen, rleLen, popdfLen, n; + int nCols, i, j, k; - // update yMax for this line - if (str1->yMax > yMax) - yMax = str1->yMax; + nCols = 0; - // if we've hit the end of the line... - if (!(str1->yxNext && - !(rawOrder && str1->yxNext->yMax < str1->yMin) && - str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax && - str1->yxNext->xMin >= str1->xMax)) { + if (uMap->isUnicode()) { - // print a return - fputc('\n', f); + lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); + rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); + popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); - // print extra vertical space if necessary - if (str1->yxNext) { + if (primaryLR) { - // find yMin for next line - yMin = str1->yxNext->yMin; - for (str2 = str1->yxNext; str2; str2 = str2->yxNext) { - if (str2->yMin < yMin) - yMin = str2->yMin; - if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax && - str2->yxNext->xMin >= str2->xMax)) - break; + i = 0; + while (i < len) { + // output a left-to-right section + for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; + for (k = i; k < j; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; } - - // print the space - d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5); - if (rawOrder && d > 2) { - d = 2; + i = j; + // output a right-to-left section + for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ; + if (j > i) { + s->append(rle, rleLen); + for (k = j - 1; k >= i; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; } - for (; d > 0; --d) { - fputc('\n', f); + } + + } else { + + s->append(rle, rleLen); + i = len - 1; + while (i >= 0) { + // output a right-to-left section + for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ; + for (k = i; k > j; --k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + i = j; + // output a left-to-right section + for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; + if (j < i) { + s->append(lre, lreLen); + for (k = j + 1; k <= i; ++k) { + n = uMap->mapUnicode(text[k], buf, sizeof(buf)); + s->append(buf, n); + ++nCols; + } + s->append(popdf, popdfLen); + i = j; } } + s->append(popdf, popdfLen); - // set up for next line - col1 = 0; - yMax = str1->yxNext ? str1->yxNext->yMax : 0; + } + + } else { + for (i = 0; i < len; ++i) { + n = uMap->mapUnicode(text[i], buf, sizeof(buf)); + s->append(buf, n); + nCols += n; } } -} -void TextPage::clear() { - TextString *p1, *p2; + return nCols; +} - if (curStr) { - delete curStr; - curStr = NULL; - } - for (p1 = yxStrings; p1; p1 = p2) { - p2 = p1->yxNext; - delete p1; - } - yxStrings = NULL; - xyStrings = NULL; - yxCur1 = yxCur2 = NULL; +#if TEXTOUT_WORD_LIST +TextWordList *TextPage::makeWordList(GBool physLayout) { + return new TextWordList(this, physLayout); } +#endif //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ -TextOutputDev::TextOutputDev(char *fileName, TextOutputCharSet charSet, - GBool rawOrder) { +static void outputToFile(void *stream, char *text, int len) { + fwrite(text, 1, len, (FILE *)stream); +} + +TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append) { text = NULL; - this->rawOrder = rawOrder; + physLayout = physLayoutA; + rawOrder = rawOrderA; ok = gTrue; // open file needClose = gFalse; if (fileName) { if (!strcmp(fileName, "-")) { - f = stdout; - } else if ((f = fopen(fileName, "w"))) { + outputStream = stdout; +#ifdef WIN32 + // keep DOS from munging the end-of-line characters + setmode(fileno(stdout), O_BINARY); +#endif + } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) { needClose = gTrue; } else { error(-1, "Couldn't open text file '%s'", fileName); ok = gFalse; return; } + outputFunc = &outputToFile; } else { - f = NULL; + outputStream = NULL; } // set up text object - text = new TextPage(charSet, rawOrder); + text = new TextPage(rawOrderA); +} + +TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA) { + outputFunc = func; + outputStream = stream; + needClose = gFalse; + physLayout = physLayoutA; + rawOrder = rawOrderA; + text = new TextPage(rawOrderA); + ok = gTrue; } TextOutputDev::~TextOutputDev() { if (needClose) { #ifdef MACOS - ICS_MapRefNumAndAssign((short)f->handle); + ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle); #endif - fclose(f); + fclose((FILE *)outputStream); } if (text) { delete text; @@ -1199,73 +3474,56 @@ TextOutputDev::~TextOutputDev() { } void TextOutputDev::startPage(int pageNum, GfxState *state) { - text->clear(); + text->startPage(state); } void TextOutputDev::endPage() { - text->coalesce(); - if (f) { - text->dump(f); - fputc('\n', f); - fputs("\f\n", f); - fputc('\n', f); + text->endPage(); + text->coalesce(physLayout); + if (outputStream) { + text->dump(outputStream, outputFunc, physLayout); } } void TextOutputDev::updateFont(GfxState *state) { - GfxFont *font; - char *charName; - int c; - - // look for hex char codes in subsetted font - hexCodes = gFalse; - if ((font = state->getFont()) && !font->is16Bit()) { - for (c = 0; c < 256; ++c) { - if ((charName = font->getCharName(c))) { - if ((charName[0] == 'B' || charName[0] == 'C' || - charName[0] == 'G') && - strlen(charName) == 3 && - isxdigit(charName[1]) && isxdigit(charName[2]) && - ((charName[1] >= 'a' && charName[1] <= 'f') || - (charName[1] >= 'A' && charName[1] <= 'F') || - (charName[2] >= 'a' && charName[2] <= 'f') || - (charName[2] >= 'A' && charName[2] <= 'F'))) { - hexCodes = gTrue; - break; - } else if ((strlen(charName) == 2) && - isxdigit(charName[0]) && isxdigit(charName[1]) && - ((charName[0] >= 'a' && charName[0] <= 'f') || - (charName[0] >= 'A' && charName[0] <= 'F') || - (charName[1] >= 'a' && charName[1] <= 'f') || - (charName[1] >= 'A' && charName[1] <= 'F'))) { - hexCodes = gTrue; - break; - } - } - } - } + text->updateFont(state); } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state, s, hexCodes); } void TextOutputDev::endString(GfxState *state) { - text->endString(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c) { - text->addChar(state, x, y, dx, dy, c); -} - -void TextOutputDev::drawChar16(GfxState *state, double x, double y, - double dx, double dy, int c) { - text->addChar16(state, x, y, dx, dy, c, state->getFont()->getCharSet16()); + double dx, double dy, + double originX, double originY, + CharCode c, Unicode *u, int uLen) { + text->addChar(state, x, y, dx, dy, c, u, uLen); } -GBool TextOutputDev::findText(char *s, GBool top, GBool bottom, +GBool TextOutputDev::findText(Unicode *s, int len, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax) { - return text->findText(s, top, bottom, xMin, yMin, xMax, yMax); + return text->findText(s, len, startAtTop, stopAtBottom, + startAtLast, stopAtLast, xMin, yMin, xMax, yMax); +} + +GString *TextOutputDev::getText(double xMin, double yMin, + double xMax, double yMax) { + return text->getText(xMin, yMin, xMax, yMax); +} + +GBool TextOutputDev::findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax) { + return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); +} + +#if TEXTOUT_WORD_LIST +TextWordList *TextOutputDev::makeWordList() { + return text->makeWordList(physLayout); } +#endif