1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
33 #include "TextOutputDev.h"
36 // needed for setting type/creator of MacOS files
37 #include "ICSupport.h"
40 //------------------------------------------------------------------------
42 //------------------------------------------------------------------------
44 // Each bucket in a text pool includes baselines within a range of this many units.
46 #define textPoolStep 4
48 // Inter-character space width, as a fraction of the font size, which will cause addChar to break up a text string.
50 #define defaultSpaceWidth 0.25
52 // Max distance between baselines of two lines within a block, as a
53 // fraction of the font size.
54 #define maxLineSpacingDelta 1.5
56 // Max difference in primary font sizes on two lines in the same
57 // block. Delta1 is used when examining new lines above and below the
58 // current block; delta2 is used when examining text that overlaps the
59 // current block; delta3 is used when examining text to the left and
60 // right of the current block.
61 #define maxBlockFontSizeDelta1 0.05
62 #define maxBlockFontSizeDelta2 0.6
63 #define maxBlockFontSizeDelta3 0.2
65 // Max difference in font sizes inside a word.
66 #define maxWordFontSizeDelta 0.05
68 // Maximum distance between baselines of two words on the same line,
69 // e.g., distance between subscript or superscript and the primary
70 // baseline, as a fraction of the font size.
71 #define maxIntraLineDelta 0.5
73 // Minimum inter-word spacing, as a fraction of the font size. (Only
74 // used for raw ordering.)
75 #define minWordSpacing 0.2
77 // Maximum inter-word spacing, as a fraction of the font size.
78 #define maxWordSpacing 1.5
80 // Minimum spacing between columns, as a fraction of the font size.
81 #define minColSpacing 1.0
83 // Maximum vertical spacing between blocks within a flow, as a
84 // multiple of the font size.
85 #define maxBlockSpacing 2.5
87 // Minimum spacing between characters within a word, as a fraction of the font size.
89 #define minCharSpacing -0.2
91 // Maximum spacing between characters within a word, as a fraction of
92 // the font size, when there is no obvious extra-wide character spacing.
94 #define maxCharSpacing 0.03
96 // When extra-wide character spacing is detected, the inter-character
97 // space threshold is set to the minimum inter-character space
98 // multiplied by this constant.
99 #define maxWideCharSpacingMul 1.3
101 // Max difference in primary,secondary coordinates (as a fraction of
102 // the font size) allowed for duplicated text (fake boldface, drop
103 // shadows) which is to be discarded.
104 #define dupMaxPriDelta 0.1
105 #define dupMaxSecDelta 0.2
107 //------------------------------------------------------------------------
109 //------------------------------------------------------------------------
// Records the font that is current in <state>.
111 TextFontInfo::TextFontInfo(GfxState *state) {
112 gfxFont = state->getFont();
113 #if TEXTOUT_WORD_LIST
// When a font is set and it reports an original name, keep a copy of
// that name (copy() is used, so the font's own string is not shared).
114 fontName = (gfxFont && gfxFont->getOrigName())
115 ? gfxFont->getOrigName()->copy()
120 TextFontInfo::~TextFontInfo() {
121 #if TEXTOUT_WORD_LIST
// Returns true when the font current in <state> is the same font
// object this entry was created for (pointer comparison).
128 GBool TextFontInfo::matches(GfxState *state) {
129 return state->getFont() == gfxFont;
132 //------------------------------------------------------------------------
134 //------------------------------------------------------------------------
// Starts a new, empty word at user-space position (x0, y0), which is
// mapped through state->transform().  The ascent and descent are
// derived from the font metrics scaled by the font size.
136 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
137 int charPosA, TextFontInfo *fontA, double fontSizeA) {
139 double x, y, ascent, descent;
145 fontSize = fontSizeA;
146 state->transform(x0, y0, &x, &y);
147 if ((gfxFont = font->gfxFont)) {
148 ascent = gfxFont->getAscent() * fontSize;
149 descent = gfxFont->getDescent() * fontSize;
151 // this means that the PDF file draws text without a current font,
152 // which should never happen
// fall back to fixed ascent/descent ratios so the word still has height
153 ascent = 0.95 * fontSize;
154 descent = -0.35 * fontSize;
161 // this is a sanity check for a case that shouldn't happen -- but
162 // if it does happen, we want to avoid dividing by zero later
172 // this is a sanity check for a case that shouldn't happen -- but
173 // if it does happen, we want to avoid dividing by zero later
183 // this is a sanity check for a case that shouldn't happen -- but
184 // if it does happen, we want to avoid dividing by zero later
194 // this is a sanity check for a case that shouldn't happen -- but
195 // if it does happen, we want to avoid dividing by zero later
208 #if TEXTOUT_WORD_LIST
// render modes whose low two bits equal 1 take the stroke color;
// the other branch reads the fill color
211 if ((state->getRender() & 3) == 1) {
212 state->getStrokeRGB(&rgb);
214 state->getFillRGB(&rgb);
222 TextWord::~TextWord() {
// Appends one Unicode character to the word.  The text and edge
// arrays are reallocated to <size> (edge holds one extra entry so
// that there is a trailing edge after the last character).
227 void TextWord::addChar(GfxState *state, double x, double y,
228 double dx, double dy, Unicode u) {
231 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
232 edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
// Four variants follow, each storing the far edge of the new character
// and updating one bounding-box limit -- presumably one per rotation
// (TODO confirm).
241 xMax = edge[len+1] = x + dx;
248 yMax = edge[len+1] = y + dy;
255 xMin = edge[len+1] = x + dx;
262 yMin = edge[len+1] = y + dy;
// Appends the characters of <word> to this word: the bounding box is
// compared against <word>'s, the text/edge arrays are grown if
// needed, and <word>'s text, edges and charLen are added on.
268 void TextWord::merge(TextWord *word) {
271 if (word->xMin < xMin) {
274 if (word->yMin < yMin) {
277 if (word->xMax > xMax) {
280 if (word->yMax > yMax) {
// make room for the combined character count
283 if (len + word->len > size) {
284 size = len + word->len;
285 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
286 edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
288 for (i = 0; i < word->len; ++i) {
289 text[len + i] = word->text[i];
290 edge[len + i] = word->edge[i];
// copy the trailing edge of the appended word as well
292 edge[len + word->len] = word->edge[word->len];
294 charLen += word->charLen;
// Three-way comparison of this word against <word>: returns -1, 0 or
// +1 from the sign of one of four coordinate differences (presumably
// selected by the word's rotation -- TODO confirm).
297 inline int TextWord::primaryCmp(TextWord *word) {
300 cmp = 0; // make gcc happy
303 cmp = xMin - word->xMin;
306 cmp = yMin - word->yMin;
// these two variants are reversed: a larger max sorts first
309 cmp = word->xMax - xMax;
312 cmp = word->yMax - yMax;
315 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Gap between this word and <word>: one word's near limit minus the
// other's far limit, in one of four variants (presumably selected by
// rotation -- TODO confirm).  Negative when the two overlap.
318 double TextWord::primaryDelta(TextWord *word) {
321 delta = 0; // make gcc happy
324 delta = word->xMin - xMax;
327 delta = word->yMin - yMax;
330 delta = xMin - word->xMax;
333 delta = yMin - word->yMax;
// qsort-style comparator over an array of TextWord pointers (each
// argument points at a TextWord *).  Compares yMin, and also xMin
// (presumably only as a tie-break when yMin is equal -- TODO confirm).
339 int TextWord::cmpYX(const void *p1, const void *p2) {
340 TextWord *word1 = *(TextWord **)p1;
341 TextWord *word2 = *(TextWord **)p2;
344 cmp = word1->yMin - word2->yMin;
346 cmp = word1->xMin - word2->xMin;
348 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
351 #if TEXTOUT_WORD_LIST
// Returns the word's text converted with the text encoding obtained
// from globalParams; each Unicode character is mapped through
// uMap->mapUnicode() into a local buffer.
353 GString *TextWord::getText() {
360 if (!(uMap = globalParams->getTextEncoding())) {
363 for (i = 0; i < len; ++i) {
364 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
371 #endif // TEXTOUT_WORD_LIST
373 //------------------------------------------------------------------------
375 //------------------------------------------------------------------------
377 TextPool::TextPool() {
// Walks every bucket from minBaseIdx to maxBaseIdx and each word list
// in it.  word2 carries the next pointer across iterations,
// presumably so the current word can be deleted -- TODO confirm.
385 TextPool::~TextPool() {
387 TextWord *word, *word2;
389 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
390 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
// Converts a baseline coordinate to a bucket index (base divided by
// textPoolStep, truncated).  The result is checked against
// minBaseIdx/maxBaseIdx, presumably to clamp it -- TODO confirm.
398 int TextPool::getBaseIdx(double base) {
401 baseIdx = (int)(base / textPoolStep);
402 if (baseIdx < minBaseIdx) {
405 if (baseIdx > maxBaseIdx) {
// Inserts <word> into the bucket for its baseline, first growing the
// bucket array if that bucket lies outside [minBaseIdx, maxBaseIdx].
// Within a bucket the list position is found with primaryCmp().
411 void TextPool::addWord(TextWord *word) {
413 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
416 // expand the array if needed
417 wordBaseIdx = (int)(word->base / textPoolStep);
// minBaseIdx > maxBaseIdx marks a pool with no array yet: allocate
// 128 buckets of slack on each side of the first word
418 if (minBaseIdx > maxBaseIdx) {
419 minBaseIdx = wordBaseIdx - 128;
420 maxBaseIdx = wordBaseIdx + 128;
421 pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) *
423 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
424 pool[baseIdx - minBaseIdx] = NULL;
// grow downward: a new array is needed because existing entries shift
426 } else if (wordBaseIdx < minBaseIdx) {
427 newMinBaseIdx = wordBaseIdx - 128;
428 newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) *
430 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
431 newPool[baseIdx - newMinBaseIdx] = NULL;
433 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
434 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
437 minBaseIdx = newMinBaseIdx;
// grow upward: existing entries keep their offsets, so realloc suffices
438 } else if (wordBaseIdx > maxBaseIdx) {
439 newMaxBaseIdx = wordBaseIdx + 128;
440 pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) *
442 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
443 pool[baseIdx - minBaseIdx] = NULL;
445 maxBaseIdx = newMaxBaseIdx;
448 // insert the new word
// if the word lands in the cursor's bucket and sorts after the cursor,
// the search presumably resumes from the cursor -- TODO confirm
449 if (cursor && wordBaseIdx == cursorBaseIdx &&
450 word->primaryCmp(cursor) > 0) {
455 w1 = pool[wordBaseIdx - minBaseIdx];
// advance until <word> no longer sorts after w1 (w0 trails one behind)
457 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
462 pool[wordBaseIdx - minBaseIdx] = word;
465 cursorBaseIdx = wordBaseIdx;
468 //------------------------------------------------------------------------
470 //------------------------------------------------------------------------
472 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
478 words = lastWord = NULL;
488 TextLine::~TextLine() {
// Links <word> after the current last word and compares its bounding
// box against the line's xMin/xMax/yMin/yMax.
501 void TextLine::addWord(TextWord *word) {
503 lastWord->next = word;
515 if (word->xMin < xMin) {
518 if (word->xMax > xMax) {
521 if (word->yMin < yMin) {
524 if (word->yMax > yMax) {
// Gap between this line and <line>: one line's near limit minus the
// other's far limit, in one of four variants (presumably selected by
// rotation -- TODO confirm).
530 double TextLine::primaryDelta(TextLine *line) {
533 delta = 0; // make gcc happy
536 delta = line->xMin - xMax;
539 delta = line->yMin - yMax;
542 delta = xMin - line->xMax;
545 delta = yMin - line->yMax;
// Three-way comparison (-1, 0, +1) of this line against <line>, from
// one of four coordinate differences (presumably selected by rotation
// -- TODO confirm).
551 int TextLine::primaryCmp(TextLine *line) {
554 cmp = 0; // make gcc happy
557 cmp = xMin - line->xMin;
560 cmp = yMin - line->yMin;
563 cmp = line->xMax - xMax;
566 cmp = line->yMax - yMax;
569 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Three-way comparison (-1, 0, +1) of the two baselines.  For
// rotations 0 and 3 a smaller base sorts first; for rotations 1 and 2
// the order is reversed.
572 int TextLine::secondaryCmp(TextLine *line) {
575 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
576 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Orders lines by secondaryCmp() first; primaryCmp() decides when the
// secondary comparison reports equality.
579 int TextLine::cmpYX(TextLine *line) {
582 if ((cmp = secondaryCmp(line))) {
585 return primaryCmp(line);
// qsort-style comparator over an array of TextLine pointers: orders
// by primaryCmp() first, with secondaryCmp() as the tie-break.
588 int TextLine::cmpXY(const void *p1, const void *p2) {
589 TextLine *line1 = *(TextLine **)p1;
590 TextLine *line2 = *(TextLine **)p2;
593 if ((cmp = line1->primaryCmp(line2))) {
596 return line1->secondaryCmp(line2);
// Finalizes a line: derives the inter-word space threshold, marks
// word gaps and unlinks adjacent same-font words, then builds the
// line's text, edge and col arrays and sets the hyphenated flag.
// <uMap> is used to measure the converted width of each character.
599 void TextLine::coalesce(UnicodeMap *uMap) {
600 TextWord *word0, *word1;
601 double space, delta, minSpace;
608 // compute the inter-word space threshold
609 if (words->len > 1 || words->next->len > 1) {
// otherwise scan the remaining word pairs for the smallest gap,
// stopping early once a non-positive gap has been seen
612 minSpace = words->primaryDelta(words->next);
613 for (word0 = words->next, word1 = word0->next;
614 word1 && minSpace > 0;
615 word0 = word1, word1 = word0->next) {
616 if (word1->len > 1) {
619 delta = word0->primaryDelta(word1);
620 if (delta < minSpace) {
// two candidate thresholds: a fixed fraction of the font size, or the
// smallest observed gap scaled by maxWideCharSpacingMul
626 space = maxCharSpacing * words->fontSize;
628 space = maxWideCharSpacingMul * minSpace;
// a gap at or above the threshold is a real word break
635 if (word0->primaryDelta(word1) >= space) {
636 word0->spaceAfter = gTrue;
// otherwise the pair qualifies for joining when the font matches, the
// sizes are close, and the character positions are contiguous
639 } else if (word0->font == word1->font &&
640 fabs(word0->fontSize - word1->fontSize) <
641 maxWordFontSizeDelta * words->fontSize &&
642 word1->charPos == word0->charPos + word0->charLen) {
644 word0->next = word1->next;
654 // build the line text
655 isUnicode = uMap ? uMap->isUnicode() : gFalse;
// first pass over the words, before the arrays are sized
657 for (word1 = words; word1; word1 = word1->next) {
659 if (word1->spaceAfter) {
663 text = (Unicode *)gmalloc(len * sizeof(Unicode));
664 edge = (double *)gmalloc((len + 1) * sizeof(double));
// second pass: copy characters and edges, inserting a space (U+0020)
// after each word flagged with spaceAfter
666 for (word1 = words; word1; word1 = word1->next) {
667 for (j = 0; j < word1->len; ++j) {
668 text[i] = word1->text[j];
669 edge[i] = word1->edge[j];
672 edge[i] = word1->edge[word1->len];
673 if (word1->spaceAfter) {
674 text[i] = (Unicode)0x0020;
679 // compute convertedLen and set up the col array
680 col = (int *)gmalloc((len + 1) * sizeof(int));
// col[i] is the running converted width before character i
682 for (i = 0; i < len; ++i) {
683 col[i] = convertedLen;
687 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
690 col[len] = convertedLen;
692 // check for hyphen at end of line
693 //~ need to check for other chars used as hyphens
// NOTE(review): reads text[len - 1], so this assumes len > 0 -- confirm
694 hyphenated = text[len - 1] == (Unicode)'-';
697 //------------------------------------------------------------------------
699 //------------------------------------------------------------------------
704 TextLine *line; // the line object
705 int start, len; // offset and length of this fragment
706 // (in Unicode chars)
707 double xMin, xMax; // bounding box coordinates
709 double base; // baseline virtual coordinate
710 int col; // first column
712 void init(TextLine *lineA, int startA, int lenA);
713 void computeCoords(GBool oneRot);
715 static int cmpYXPrimaryRot(const void *p1, const void *p2);
716 static int cmpYXLineRot(const void *p1, const void *p2);
717 static int cmpXYLineRot(const void *p1, const void *p2);
// Initializes the fragment; its starting column is taken from the
// line's col array at the fragment's start offset.
720 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
724 col = line->col[start];
// Computes the fragment's bounding box (xMin/xMax/yMin/yMax) and
// baseline.  The extent along the text direction comes from the
// line's edge array at [start] and [start + len].  In the general
// path the coordinates are first normalized to fractions of the
// block's bounding box (d0..d4), then mapped back according to the
// page's primary rotation.
727 void TextLineFrag::computeCoords(GBool oneRot) {
729 double d0, d1, d2, d3, d4;
735 xMin = line->edge[start];
736 xMax = line->edge[start + len];
743 yMin = line->edge[start];
744 yMax = line->edge[start + len];
// in these two variants the edges run in decreasing order, so the
// fragment's end edge is the minimum
747 xMin = line->edge[start + len];
748 xMax = line->edge[start];
755 yMin = line->edge[start + len];
756 yMax = line->edge[start];
// line and page are both unrotated
763 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
765 xMin = line->edge[start];
766 xMax = line->edge[start + len];
// general path: d0/d1 are the fragment's extent along the line
774 d0 = line->edge[start];
775 d1 = line->edge[start + len];
776 d2 = d3 = d4 = 0; // make gcc happy
// normalize to fractions of the block's extent (four variants)
// NOTE(review): the divisors are zero for a block of zero width or
// height -- confirm this cannot occur
783 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
784 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
785 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
786 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
787 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
793 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
794 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
795 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
796 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
797 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
803 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
804 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
805 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
806 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
807 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
813 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
814 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
815 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
816 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
817 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
// map the fractions back according to the page's primary rotation
821 switch (line->blk->page->primaryRot) {
823 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
824 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
825 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
826 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
// the baseline comes from the normalized value d4, as in the other
// three rotations (this line previously scaled the member <base>)
827 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
830 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
831 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
832 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
833 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
834 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
837 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
838 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
839 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
840 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
841 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
844 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
845 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
846 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
847 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
848 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
// qsort-style comparator over an array of TextLineFrag objects (the
// arguments point directly at fragments).  The ordering depends on
// the page's primary rotation; each variant compares one coordinate
// and falls back to a second when the first is equal.
856 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
857 TextLineFrag *frag1 = (TextLineFrag *)p1;
858 TextLineFrag *frag2 = (TextLineFrag *)p2;
861 cmp = 0; // make gcc happy
862 switch (frag1->line->blk->page->primaryRot) {
864 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
865 cmp = frag1->xMin - frag2->xMin;
869 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
870 cmp = frag1->yMin - frag2->yMin;
874 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
875 cmp = frag2->xMax - frag1->xMax;
879 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
880 cmp = frag2->yMax - frag1->yMax;
884 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator over an array of TextLineFrag objects.  Same
// coordinate pairs as cmpYXPrimaryRot, but the variant is chosen by
// the first fragment's line rotation instead of the page rotation.
887 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
888 TextLineFrag *frag1 = (TextLineFrag *)p1;
889 TextLineFrag *frag2 = (TextLineFrag *)p2;
892 cmp = 0; // make gcc happy
893 switch (frag1->line->rot) {
895 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
896 cmp = frag1->xMin - frag2->xMin;
900 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
901 cmp = frag1->yMin - frag2->yMin;
905 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
906 cmp = frag2->xMax - frag1->xMax;
910 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
911 cmp = frag2->yMax - frag1->yMax;
915 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator over an array of TextLineFrag objects,
// chosen by the first fragment's line rotation.  Compared with
// cmpYXLineRot, the roles of the two coordinates are swapped.
918 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
919 TextLineFrag *frag1 = (TextLineFrag *)p1;
920 TextLineFrag *frag2 = (TextLineFrag *)p2;
923 cmp = 0; // make gcc happy
924 switch (frag1->line->rot) {
926 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
927 cmp = frag1->yMin - frag2->yMin;
931 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
932 cmp = frag2->xMax - frag1->xMax;
936 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
937 cmp = frag2->yMin - frag1->yMin;
941 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
942 cmp = frag1->xMax - frag2->xMax;
946 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
949 //------------------------------------------------------------------------
951 //------------------------------------------------------------------------
953 TextBlock::TextBlock(TextPage *pageA, int rotA) {
959 priMax = page->pageWidth;
960 pool = new TextPool();
967 TextBlock::~TextBlock() {
// Adds <word> to the block; the word's bounding box is compared
// against the block's xMin/xMax/yMin/yMax.
978 void TextBlock::addWord(TextWord *word) {
986 if (word->xMin < xMin) {
989 if (word->xMax > xMax) {
992 if (word->yMin < yMin) {
995 if (word->yMax > yMax) {
// Turns the block's pool of words into lines.  The visible stages
// are: discard duplicated words, repeatedly start a line from a word
// near the top of the pool and pull in neighbouring words, coalesce
// each line, and finally assign text columns across lines.
1001 void TextBlock::coalesce(UnicodeMap *uMap) {
1002 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1003 TextLine *line, *line0, *line1;
1004 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1005 int baseIdx, bestWordBaseIdx, idx0, idx1;
1006 double minBase, maxBase;
1007 double fontSize, delta, priDelta, secDelta;
1008 TextLine **lineArray;
1013 // discard duplicated text (fake boldface, drop shadows)
1014 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1015 word0 = pool->getPool(idx0);
// tolerances scale with the font size of the word being checked
1017 priDelta = dupMaxPriDelta * word0->fontSize;
1018 secDelta = dupMaxSecDelta * word0->fontSize;
// duplicates are searched for up to secDelta away from the baseline
1019 if (rot == 0 || rot == 3) {
1020 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1022 maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1025 word1 = word2 = NULL; // make gcc happy
1026 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1029 word2 = word0->next;
1032 word2 = pool->getPool(idx1);
1034 for (; word2; word1 = word2, word2 = word2->next) {
// a duplicate candidate has identical text ...
1035 if (word2->len == word0->len &&
1036 !memcmp(word2->text, word0->text,
1037 word0->len * sizeof(Unicode))) {
// ... and a bounding box within the tolerances (the two variants swap
// which axis gets the primary tolerance)
1041 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1042 fabs(word0->xMax - word2->xMax) < priDelta &&
1043 fabs(word0->yMin - word2->yMin) < secDelta &&
1044 fabs(word0->yMax - word2->yMax) < secDelta;
1048 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1049 fabs(word0->xMax - word2->xMax) < secDelta &&
1050 fabs(word0->yMin - word2->yMin) < priDelta &&
1051 fabs(word0->yMax - word2->yMax) < priDelta;
// unlink word2, either from its predecessor or from the bucket head
1065 word1->next = word2->next;
1067 pool->setPool(idx1, word2->next);
1071 word0 = word0->next;
1078 poolMinBaseIdx = pool->minBaseIdx;
1083 // find the first non-empty line in the pool
1085 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1087 if (poolMinBaseIdx > pool->maxBaseIdx) {
1091 // look for the left-most word in the first four lines of the
1092 // pool -- this avoids starting with a superscript word
1093 startBaseIdx = poolMinBaseIdx;
1094 for (baseIdx = poolMinBaseIdx + 1;
1095 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1097 if (!pool->getPool(baseIdx)) {
1100 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1102 startBaseIdx = baseIdx;
1106 // create a new line
1107 word0 = pool->getPool(startBaseIdx);
1108 pool->setPool(startBaseIdx, word0->next);
1110 line = new TextLine(this, word0->rot, word0->base);
1111 line->addWord(word0);
1114 // compute the search range
1115 fontSize = word0->fontSize;
1116 minBase = word0->base - maxIntraLineDelta * fontSize;
1117 maxBase = word0->base + maxIntraLineDelta * fontSize;
1118 minBaseIdx = pool->getBaseIdx(minBase);
1119 maxBaseIdx = pool->getBaseIdx(maxBase);
1121 // find the rest of the words in this line
1124 // find the left-most word whose baseline is in the range for this line
1126 bestWordBaseIdx = 0;
1127 bestWord0 = bestWord1 = NULL;
1128 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1129 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1131 word0 = word1, word1 = word1->next) {
// the candidate must lie in the baseline range and must not overlap
// the last word by more than the allowed negative spacing
1132 if (word1->base >= minBase &&
1133 word1->base <= maxBase &&
1134 (delta = lastWord->primaryDelta(word1)) >=
1135 minCharSpacing * fontSize) {
// it must also be close enough, and beat the best candidate so far
1136 if (delta < maxWordSpacing * fontSize &&
1137 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1138 bestWordBaseIdx = baseIdx;
1150 // remove it from the pool, and add it to the line
1152 bestWord0->next = bestWord1->next;
1154 pool->setPool(bestWordBaseIdx, bestWord1->next);
1156 bestWord1->next = NULL;
1157 line->addWord(bestWord1);
1158 lastWord = bestWord1;
// locate the new line's position in the line list using cmpYX; when
// it sorts after curLine the scan starts from curLine's successor
1162 if (curLine && line->cmpYX(curLine) > 0) {
1164 line1 = curLine->next;
1170 line1 && line->cmpYX(line1) > 0;
1171 line0 = line1, line1 = line1->next) ;
1179 line->coalesce(uMap);
1180 charCount += line->len;
1184 // sort lines into xy order for column assignment
1185 lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *));
1186 for (line = lines, i = 0; line; line = line->next, ++i) {
1187 lineArray[i] = line;
1189 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1191 // column assignment
1193 for (i = 0; i < nLines; ++i) {
1194 line0 = lineArray[i];
// compare against every earlier line in the sorted order
1196 for (j = 0; j < i; ++j) {
1197 line1 = lineArray[j];
// line1 lies entirely before line0: start after line1's last column
1198 if (line1->primaryDelta(line0) >= 0) {
1199 col2 = line1->col[line1->len] + 1;
1201 k = 0; // make gcc happy
// otherwise find the character of line1 whose midpoint line0's start
// has passed (four variants), and take that character's column
1206 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1212 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1218 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1224 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1228 col2 = line1->col[k];
// shift every column of line0 by the offset found above
1234 for (k = 0; k <= line0->len; ++k) {
1235 line0->col[k] += col1;
// track the widest line to get the block's column count
1237 if (line0->col[line0->len] > nColumns) {
1238 nColumns = line0->col[line0->len];
// Tightens this block's priMin/priMax using a neighbouring block
// <blk>.  When <blk> overlaps this block along one axis and extends
// beyond it along the other, <blk>'s facing limit becomes a candidate
// new limit.
1244 void TextBlock::updatePriMinMax(TextBlock *blk) {
1245 double newPriMin, newPriMax;
1246 GBool gotPriMin, gotPriMax;
1248 gotPriMin = gotPriMax = gFalse;
1249 newPriMin = newPriMax = 0; // make gcc happy
1250 switch (page->primaryRot) {
// variant using x as the primary axis: requires vertical overlap
1253 if (blk->yMin < yMax && blk->yMax > yMin) {
1254 if (blk->xMin < xMin) {
1255 newPriMin = blk->xMax;
1258 if (blk->xMax > xMax) {
1259 newPriMax = blk->xMin;
// variant using y as the primary axis: requires horizontal overlap
1266 if (blk->xMin < xMax && blk->xMax > xMin) {
1267 if (blk->yMin < yMin) {
1268 newPriMin = blk->yMax;
1271 if (blk->yMax > yMax) {
1272 newPriMax = blk->yMin;
// NOTE(review): the candidates are compared against xMin/xMax here
// even though the y-axis variant above produces y coordinates --
// confirm this is intended for rotated pages
1279 if (newPriMin > xMin) {
1282 if (newPriMin > priMin) {
1287 if (newPriMax < xMax) {
1290 if (newPriMax < priMax) {
// qsort-style comparator over an array of TextBlock pointers.  The
// ordering depends on the page's primary rotation; each variant
// compares one coordinate and falls back to a second when equal.
1296 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1297 TextBlock *blk1 = *(TextBlock **)p1;
1298 TextBlock *blk2 = *(TextBlock **)p2;
1301 cmp = 0; // make gcc happy
1302 switch (blk1->page->primaryRot) {
1304 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1305 cmp = blk1->yMin - blk2->yMin;
1309 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1310 cmp = blk2->xMax - blk1->xMax;
1314 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1315 cmp = blk2->yMin - blk1->yMin;
1319 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1320 cmp = blk1->xMax - blk2->xMax;
1324 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator over an array of TextBlock pointers, chosen
// by the page's primary rotation.  Compared with cmpXYPrimaryRot, the
// roles of the two coordinates are swapped.
1327 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1328 TextBlock *blk1 = *(TextBlock **)p1;
1329 TextBlock *blk2 = *(TextBlock **)p2;
1332 cmp = 0; // make gcc happy
1333 switch (blk1->page->primaryRot) {
1335 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1336 cmp = blk1->xMin - blk2->xMin;
1340 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1341 cmp = blk1->yMin - blk2->yMin;
1345 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1346 cmp = blk2->xMax - blk1->xMax;
1350 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1351 cmp = blk2->yMax - blk1->yMax;
1355 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Three-way comparison (-1, 0, +1) of this block against <blk>, from
// one of four coordinate differences (presumably selected by rotation
// -- TODO confirm).
1358 int TextBlock::primaryCmp(TextBlock *blk) {
1361 cmp = 0; // make gcc happy
1364 cmp = xMin - blk->xMin;
1367 cmp = yMin - blk->yMin;
1370 cmp = blk->xMax - xMax;
1373 cmp = blk->yMax - yMax;
1376 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Gap between this block and <blk> across the text direction: one
// block's near limit minus the other's far limit, in one of four
// variants (presumably selected by rotation -- TODO confirm).
1379 double TextBlock::secondaryDelta(TextBlock *blk) {
1382 delta = 0; // make gcc happy
1385 delta = blk->yMin - yMax;
1388 delta = xMin - blk->xMax;
1391 delta = yMin - blk->yMax;
1394 delta = blk->xMin - xMax;
// Tests whether this block sits below <blk>.  Every variant requires
// this block's extent to lie within <blk>'s [priMin, priMax] range;
// each condition continues with a further term on its next line.
1400 GBool TextBlock::isBelow(TextBlock *blk) {
1403 below = gFalse; // make gcc happy
1404 switch (page->primaryRot) {
1406 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1410 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1414 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1418 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1426 //------------------------------------------------------------------------
1428 //------------------------------------------------------------------------
// Creates a flow containing the single block <blk>; the flow inherits
// the block's priMin/priMax range.
1430 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1436 priMin = blk->priMin;
1437 priMax = blk->priMax;
1438 blocks = lastBlk = blk;
1442 TextFlow::~TextFlow() {
1447 blocks = blocks->next;
// Appends <blk> after the flow's last block and compares its bounding
// box against the flow's xMin/xMax/yMin/yMax.
1452 void TextFlow::addBlock(TextBlock *blk) {
1454 lastBlk->next = blk;
1459 if (blk->xMin < xMin) {
1462 if (blk->xMax > xMax) {
1465 if (blk->yMin < yMin) {
1468 if (blk->yMax > yMax) {
// Decides whether <blk> may be appended to this flow.  The block's
// first word must not use a larger font than the first word of the
// flow's last block, and the block must lie within the flow's
// [priMin, priMax] range along the axis chosen by the page rotation.
1473 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1476 // lower blocks must use smaller fonts
// NOTE(review): the comparison uses lastBlk; no use of <prevBlk> is
// visible in this function -- confirm
1477 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1481 fits = gFalse; // make gcc happy
1482 switch (page->primaryRot) {
1484 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1487 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1490 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1493 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1499 #if TEXTOUT_WORD_LIST
1501 //------------------------------------------------------------------------
1503 //------------------------------------------------------------------------
// Builds a flat list of the page's words.  Three orderings exist:
// raw order (the page's rawWords list as is), physical layout (all
// words sorted with TextWord::cmpYX), and otherwise the reading order
// given by walking flows, blocks, lines and words.
1505 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1510 TextWord **wordArray;
1513 words = new GList();
1515 if (text->rawOrder) {
1516 for (word = text->rawWords; word; word = word->next) {
1517 words->append(word);
1520 } else if (physLayout) {
1521 // this is inefficient, but it's also the least useful of these cases
// first walk over every word, before the array is sized
1524 for (flow = text->flows; flow; flow = flow->next) {
1525 for (blk = flow->blocks; blk; blk = blk->next) {
1526 for (line = blk->lines; line; line = line->next) {
1527 for (word = line->words; word; word = word->next) {
1533 wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *));
// second walk: collect the word pointers, then sort them
1535 for (flow = text->flows; flow; flow = flow->next) {
1536 for (blk = flow->blocks; blk; blk = blk->next) {
1537 for (line = blk->lines; line; line = line->next) {
1538 for (word = line->words; word; word = word->next) {
1539 wordArray[i++] = word;
1544 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1545 for (i = 0; i < nWords; ++i) {
1546 words->append(wordArray[i]);
// default: append in flow/block/line/word order
1551 for (flow = text->flows; flow; flow = flow->next) {
1552 for (blk = flow->blocks; blk; blk = blk->next) {
1553 for (line = blk->lines; line; line = line->next) {
1554 for (word = line->words; word; word = word->next) {
1555 words->append(word);
1563 TextWordList::~TextWordList() {
// Returns the number of words in the list.
1567 int TextWordList::getLength() {
1568 return words->getLength();
// Returns the word at index <idx>.  Indices outside
// [0, getLength()) are range-checked before the list is accessed.
1571 TextWord *TextWordList::get(int idx) {
1572 if (idx < 0 || idx >= words->getLength()) {
1575 return (TextWord *)words->get(idx);
1578 #endif // TEXTOUT_WORD_LIST
1580 //------------------------------------------------------------------------
1582 //------------------------------------------------------------------------
// Creates an empty page.  <rawOrderA> selects raw ordering; one word
// pool is created per rotation (0..3), and the font list and
// last-find state start out empty.
1584 TextPage::TextPage(GBool rawOrderA) {
1587 rawOrder = rawOrderA;
1595 for (rot = 0; rot < 4; ++rot) {
1596 pools[rot] = new TextPool();
1603 fonts = new GList();
1604 lastFindXMin = lastFindYMin = 0;
1605 haveLastFind = gFalse;
1608 TextPage::~TextPage() {
1613 for (rot = 0; rot < 4; ++rot) {
// Begins a new page.  The page size is read from <state>; the other
// branch sets both dimensions to zero (presumably when <state> is
// null -- TODO confirm).
1620 void TextPage::startPage(GfxState *state) {
1623 pageWidth = state->getPageWidth();
1624 pageHeight = state->getPageHeight();
1626 pageWidth = pageHeight = 0;
// Discards the page's contents (raw words, per-rotation pools, flows
// and font list) and recreates empty pools and an empty font list.
1630 void TextPage::clear() {
1642 rawWords = rawWords->next;
1646 for (rot = 0; rot < 4; ++rot) {
1651 flows = flows->next;
1656 deleteGList(fonts, TextFontInfo);
// rebuild the empty containers so the page can be reused
1665 for (rot = 0; rot < 4; ++rot) {
1666 pools[rot] = new TextPool();
1673 fonts = new GList();
// Called when the current font changes.  Finds the TextFontInfo entry
// matching the state's font (a new one is created and appended to the
// font list), then sets curFontSize from the transformed font size,
// with a heuristic correction for Type 3 fonts.
1676 void TextPage::updateFont(GfxState *state) {
1680 int code, mCode, letterCode, anyCode;
1684 // get the font info object
1686 for (i = 0; i < fonts->getLength(); ++i) {
1687 curFont = (TextFontInfo *)fonts->get(i);
1688 if (curFont->matches(state)) {
1694 curFont = new TextFontInfo(state);
1695 fonts->append(curFont);
1698 // adjust the font size
1699 gfxFont = state->getFont();
1700 curFontSize = state->getTransformedFontSize();
1701 if (gfxFont && gfxFont->getType() == fontType3) {
1702 // This is a hack which makes it possible to deal with some Type 3
1703 // fonts. The problem is that it's impossible to know what the
1704 // base coordinate system used in the font is without actually
1705 // rendering the font. This code tries to guess by looking at the
1706 // width of the character 'm' (which breaks if the font is a
1707 // subset that doesn't contain 'm').
1708 mCode = letterCode = anyCode = -1;
// scan all 256 codes for: the glyph named "m", the first glyph with a
// single-letter A-Z/a-z name, and the first named glyph of positive
// width
1709 for (code = 0; code < 256; ++code) {
1710 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1711 if (name && name[0] == 'm' && name[1] == '\0') {
1714 if (letterCode < 0 && name && name[1] == '\0' &&
1715 ((name[0] >= 'A' && name[0] <= 'Z') ||
1716 (name[0] >= 'a' && name[0] <= 'z'))) {
1719 if (anyCode < 0 && name &&
1720 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
// scale by the chosen glyph's width relative to a generic width, in
// order of preference: 'm', any letter, any glyph
1725 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1726 // 0.6 is a generic average 'm' width -- yes, this is a hack
1727 curFontSize *= w / 0.6;
1728 } else if (letterCode >= 0 &&
1729 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1730 // even more of a hack: 0.5 is a generic letter width
1731 curFontSize *= w / 0.5;
1732 } else if (anyCode >= 0 &&
1733 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1734 // better than nothing: 0.5 is a generic character width
1735 curFontSize *= w / 0.5;
// correct for the vertical/horizontal scale ratio of the font matrix
// NOTE(review): divides by fm[0]; assumes fm[0] != 0 is checked -- confirm
1737 fm = gfxFont->getFontMatrix();
1739 curFontSize *= fabs(fm[3] / fm[0]);
// Starts a new word at user-space position (x0, y0).  The word's
// rotation (0..3) is derived from the product of the text matrix and
// the CTM, with the font matrix applied on top for Type 3 fonts.
1744 void TextPage::beginWord(GfxState *state, double x0, double y0) {
1745 double *txtm, *ctm, *fontm;
1749 // This check is needed because Type 3 characters can contain
1750 // text-drawing operations (when TextPage is being used via
1751 // XOutputDev rather than TextOutputDev).
1757 // compute the rotation
1758 txtm = state->getTextMat();
1759 ctm = state->getCTM();
// m = upper-left 2x2 of (text matrix * CTM)
1760 m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2];
1761 m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3];
1762 m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2];
1763 m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3];
// A PDF may draw text with no current font (see the fallback in the
// TextWord constructor), so guard against a null font before asking
// for its type.
1764 if (state->getFont() && state->getFont()->getType() == fontType3) {
1765 fontm = state->getFont()->getFontMatrix();
1766 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1767 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1768 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1769 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
// pick the rotation from whichever matrix diagonal dominates
1775 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1776 rot = (m[3] < 0) ? 0 : 2;
1778 rot = (m[2] > 0) ? 1 : 3;
1781 curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
// Adds one character (character code <c>, mapped to the <uLen>
// Unicode values in <u>) drawn at (x, y) with advance (dx, dy).
// Characters outside the page, and tiny characters beyond a limit,
// are filtered out; the current word is broken at spaces, at large
// gaps, and when text runs in the reverse direction.
1784 void TextPage::addChar(GfxState *state, double x, double y,
1785 double dx, double dy,
1786 CharCode c, Unicode *u, int uLen) {
1787 double x1, y1, w1, h1, dx2, dy2, sp;
1790 // if the previous char was a space, addChar will have called
1791 // endWord, so we need to start a new word
1793 beginWord(state, x, y);
1796 // throw away chars that aren't inside the page bounds
1797 state->transform(x, y, &x1, &y1);
1798 if (x1 < 0 || x1 > pageWidth ||
1799 y1 < 0 || y1 > pageHeight) {
1803 // subtract char and word spacing from the dx,dy values
1804 sp = state->getCharSpace();
// word spacing applies only to the space character code (0x20)
1805 if (c == (CharCode)0x20) {
1806 sp += state->getWordSpace();
1808 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1811 state->transformDelta(dx, dy, &w1, &h1);
1813 // check the tiny chars limit
// a char counts as tiny when both extents are under 3 units; the
// counter is checked against a limit of 50000
1814 if (!globalParams->getTextKeepTinyChars() &&
1815 fabs(w1) < 3 && fabs(h1) < 3) {
1816 if (++nTinyChars > 50000) {
1821 // break words at space character
1822 if (uLen == 1 && u[0] == (Unicode)0x20) {
1829 // large char spacing is sometimes used to move text around -- in
1830 // this case, break text into individual chars and let the coalesce
1831 // function deal with it later
// sp becomes the gap between the current word's end and this char,
// measured along the word's rotation
1834 switch (curWord->rot) {
1835 case 0: sp = x1 - curWord->xMax; break;
1836 case 1: sp = y1 - curWord->yMax; break;
1837 case 2: sp = curWord->xMin - x1; break;
1838 case 3: sp = curWord->yMin - y1; break;
1840 if (sp > defaultSpaceWidth * curWord->fontSize) {
1842 beginWord(state, x, y);
1846 // page rotation and/or transform matrices can cause text to be
1847 // drawn in reverse order -- in this case, swap the begin/end
1848 // coordinates and break text into individual chars
1849 if ((curWord->rot == 0 && w1 < 0) ||
1850 (curWord->rot == 1 && h1 < 0) ||
1851 (curWord->rot == 2 && w1 > 0) ||
1852 (curWord->rot == 3 && h1 > 0)) {
1854 beginWord(state, x + dx, y + dy);
1861 // add the characters to the current word
// each Unicode value is placed at i steps of (w1, h1) from (x1, y1)
1866 for (i = 0; i < uLen; ++i) {
1867 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
// Ends the word currently being built.
// NOTE(review): presumably hands curWord to addWord and clears it --
// confirm against the function body.
1873 void TextPage::endWord() {
1874 // This check is needed because Type 3 characters can contain
1875 // text-drawing operations (when TextPage is being used via
1876 // XOutputDev rather than TextOutputDev).
// Registers a completed word with the page.
//
// Words with len == 0 are rejected up front.  A non-empty word is
// linked after rawLastWord (the raw-order list) or inserted into the
// pool that matches its rotation, pools[word->rot].
1888 void TextPage::addWord(TextWord *word) {
1889 // throw away zero-length words -- they don't have valid xMin/xMax
1890 // values, and they're useless anyway
1891 if (word->len == 0) {
1898 rawLastWord->next = word;
1904 pools[word->rot]->addWord(word);
// Organizes the words collected for the page into higher-level
// structures.  The work proceeds in the phases marked by the section
// comments below:
//   1. assemble the blocks: for each of the four rotations, words are
//      pulled out of the rotation's pool and grouped into TextBlocks;
//   2. determine the primary direction from the unicodeTypeL /
//      unicodeTypeR classification of every character;
//   3. column assignment: blocks are sorted with cmpXYPrimaryRot and
//      given column numbers relative to the blocks before them;
//   4. reading order sort: blocks are sorted with cmpYXPrimaryRot and
//      chained into TextFlows using a stack of blocks.
//
//   physLayout -- layout-mode flag supplied by the caller
1908 void TextPage::coalesce(GBool physLayout) {
1911 TextWord *word0, *word1, *word2;
1913 TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
1914 TextBlock **blkArray;
1915 TextFlow *flow, *lastFlow;
1916 int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
1917 double minBase, maxBase, newMinBase, newMaxBase;
1918 double fontSize, colSpace, lineSpace, intraLineSpace, blkSpace;
1922 int firstBlkIdx, nBlocksLeft;
1932 uMap = globalParams->getTextEncoding();
1938 #if 0 // for debugging
1939 printf("*** initial words ***\n");
1940 for (rot = 0; rot < 4; ++rot) {
1942 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1943 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
1944 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '",
1945 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1946 word0->base, word0->fontSize);
1947 for (i = 0; i < word0->len; ++i) {
1948 fputc(word0->text[i] & 0xff, stdout);
1957 //----- assemble the blocks
1959 //~ add an outer loop for writing mode (vertical text)
1961 // build blocks for each rotation value
1962 for (rot = 0; rot < 4; ++rot) {
1964 poolMinBaseIdx = pool->minBaseIdx;
1967 // add blocks until no more words are left
1970 // find the first non-empty line in the pool
1972 poolMinBaseIdx <= pool->maxBaseIdx &&
1973 !pool->getPool(poolMinBaseIdx);
1975 if (poolMinBaseIdx > pool->maxBaseIdx) {
1979 // look for the left-most word in the first four lines of the
1980 // pool -- this avoids starting with a superscript word
1981 startBaseIdx = poolMinBaseIdx;
1982 for (baseIdx = poolMinBaseIdx + 1;
1983 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1985 if (!pool->getPool(baseIdx)) {
1988 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1990 startBaseIdx = baseIdx;
1994 // create a new block
// the seed word is unlinked from the head of its pool bucket
1995 word0 = pool->getPool(startBaseIdx);
1996 pool->setPool(startBaseIdx, word0->next);
1998 blk = new TextBlock(this, rot);
1999 blk->addWord(word0);
// all thresholds below are scaled by the seed word's font size
2001 fontSize = word0->fontSize;
2002 minBase = maxBase = word0->base;
2003 colSpace = minColSpacing * fontSize;
2004 lineSpace = maxLineSpacingDelta * fontSize;
2005 intraLineSpace = maxIntraLineDelta * fontSize;
2007 // add words to the block
2011 // look for words on the line above the current top edge of
2013 newMinBase = minBase;
2014 for (baseIdx = pool->getBaseIdx(minBase);
2015 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2018 word1 = pool->getPool(baseIdx);
// candidate: baseline within lineSpace above minBase, overlapping the
// block along the primary axis, and font size within delta1
2020 if (word1->base < minBase &&
2021 word1->base >= minBase - lineSpace &&
2022 ((rot == 0 || rot == 2)
2023 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2024 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2025 fabs(word1->fontSize - fontSize) <
2026 maxBlockFontSizeDelta1 * fontSize) {
// unlink word1 from the bucket list (interior node vs. list head)
2029 word0->next = word1->next;
2031 pool->setPool(baseIdx, word1->next);
2033 word1 = word1->next;
2035 blk->addWord(word2);
2037 newMinBase = word2->base;
2040 word1 = word1->next;
2044 minBase = newMinBase;
2046 // look for words on the line below the current bottom edge of
2048 newMaxBase = maxBase;
2049 for (baseIdx = pool->getBaseIdx(maxBase);
2050 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2053 word1 = pool->getPool(baseIdx);
// mirror image of the "line above" test, using maxBase
2055 if (word1->base > maxBase &&
2056 word1->base <= maxBase + lineSpace &&
2057 ((rot == 0 || rot == 2)
2058 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2059 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2060 fabs(word1->fontSize - fontSize) <
2061 maxBlockFontSizeDelta1 * fontSize) {
2064 word0->next = word1->next;
2066 pool->setPool(baseIdx, word1->next);
2068 word1 = word1->next;
2070 blk->addWord(word2);
2072 newMaxBase = word2->base;
2075 word1 = word1->next;
2079 maxBase = newMaxBase;
2081 // look for words that are on lines already in the block, and
2082 // that overlap the block horizontally
2083 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2084 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2087 word1 = pool->getPool(baseIdx);
// this pass uses the looser font-size tolerance, delta2
2089 if (word1->base >= minBase - intraLineSpace &&
2090 word1->base <= maxBase + intraLineSpace &&
2091 ((rot == 0 || rot == 2)
2092 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2093 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2094 fabs(word1->fontSize - fontSize) <
2095 maxBlockFontSizeDelta2 * fontSize) {
2098 word0->next = word1->next;
2100 pool->setPool(baseIdx, word1->next);
2102 word1 = word1->next;
2104 blk->addWord(word2);
2108 word1 = word1->next;
2113 // only check for outlying words (the next two chunks of code)
2114 // if we didn't find anything else
2119 // scan down the left side of the block, looking for words
2120 // that are near (but not overlapping) the block; if there are
2121 // three or fewer, add them to the block
// first pass: only examines the candidates (nothing is unlinked here)
2123 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2124 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2126 word1 = pool->getPool(baseIdx);
2128 if (word1->base >= minBase - intraLineSpace &&
2129 word1->base <= maxBase + intraLineSpace &&
2130 ((rot == 0 || rot == 2)
2131 ? (word1->xMax <= blk->xMin &&
2132 word1->xMax > blk->xMin - colSpace)
2133 : (word1->yMax <= blk->yMin &&
2134 word1->yMax > blk->yMin - colSpace)) &&
2135 fabs(word1->fontSize - fontSize) <
2136 maxBlockFontSizeDelta3 * fontSize) {
2140 word1 = word1->next;
// second pass: same test, this time moving the words into the block
2143 if (n > 0 && n <= 3) {
2144 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2145 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2148 word1 = pool->getPool(baseIdx);
2150 if (word1->base >= minBase - intraLineSpace &&
2151 word1->base <= maxBase + intraLineSpace &&
2152 ((rot == 0 || rot == 2)
2153 ? (word1->xMax <= blk->xMin &&
2154 word1->xMax > blk->xMin - colSpace)
2155 : (word1->yMax <= blk->yMin &&
2156 word1->yMax > blk->yMin - colSpace)) &&
2157 fabs(word1->fontSize - fontSize) <
2158 maxBlockFontSizeDelta3 * fontSize) {
2161 word0->next = word1->next;
2163 pool->setPool(baseIdx, word1->next);
2165 word1 = word1->next;
2167 blk->addWord(word2);
// widen the block's baseline range to include the added word
2168 if (word2->base < minBase) {
2169 minBase = word2->base;
2170 } else if (word2->base > maxBase) {
2171 maxBase = word2->base;
2177 word1 = word1->next;
2183 // scan down the right side of the block, looking for words
2184 // that are near (but not overlapping) the block; if there are
2185 // three or fewer, add them to the block
2187 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2188 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2190 word1 = pool->getPool(baseIdx);
2192 if (word1->base >= minBase - intraLineSpace &&
2193 word1->base <= maxBase + intraLineSpace &&
2194 ((rot == 0 || rot == 2)
2195 ? (word1->xMin >= blk->xMax &&
2196 word1->xMin < blk->xMax + colSpace)
2197 : (word1->yMin >= blk->yMax &&
2198 word1->yMin < blk->yMax + colSpace)) &&
2199 fabs(word1->fontSize - fontSize) <
2200 maxBlockFontSizeDelta3 * fontSize) {
2204 word1 = word1->next;
2207 if (n > 0 && n <= 3) {
2208 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2209 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2212 word1 = pool->getPool(baseIdx);
2214 if (word1->base >= minBase - intraLineSpace &&
2215 word1->base <= maxBase + intraLineSpace &&
2216 ((rot == 0 || rot == 2)
2217 ? (word1->xMin >= blk->xMax &&
2218 word1->xMin < blk->xMax + colSpace)
2219 : (word1->yMin >= blk->yMax &&
2220 word1->yMin < blk->yMax + colSpace)) &&
2221 fabs(word1->fontSize - fontSize) <
2222 maxBlockFontSizeDelta3 * fontSize) {
2225 word0->next = word1->next;
2227 pool->setPool(baseIdx, word1->next);
2229 word1 = word1->next;
2231 blk->addWord(word2);
2232 if (word2->base < minBase) {
2233 minBase = word2->base;
2234 } else if (word2->base > maxBase) {
2235 maxBase = word2->base;
2241 word1 = word1->next;
2249 //~ need to compute the primary writing mode (horiz/vert) in
2250 //~ addition to primary rotation
2252 // coalesce the block, and add it to the list
2253 blk->coalesce(uMap);
2255 lastBlk->next = blk;
// per-rotation character totals are compared against the total for the
// current primary rotation
2260 count[rot] += blk->charCount;
2261 if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2268 #if 0 // for debugging
2269 printf("*** rotation ***\n");
2270 for (rot = 0; rot < 4; ++rot) {
2271 printf(" %d: %6d\n", rot, count[rot]);
2273 printf(" primary rot = %d\n", primaryRot);
2277 #if 0 // for debugging
2278 printf("*** blocks ***\n");
2279 for (blk = blkList; blk; blk = blk->next) {
2280 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2281 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2282 for (line = blk->lines; line; line = line->next) {
2283 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2284 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2285 for (word0 = line->words; word0; word0 = word0->next) {
2286 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2287 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2288 word0->base, word0->fontSize, word0->spaceAfter);
2289 for (i = 0; i < word0->len; ++i) {
2290 fputc(word0->text[i] & 0xff, stdout);
2299 // determine the primary direction
// every character of every word in every block is classified
2301 for (blk = blkList; blk; blk = blk->next) {
2302 for (line = blk->lines; line; line = line->next) {
2303 for (word0 = line->words; word0; word0 = word0->next) {
2304 for (i = 0; i < word0->len; ++i) {
2305 if (unicodeTypeL(word0->text[i])) {
2307 } else if (unicodeTypeR(word0->text[i])) {
// a zero lrCount also yields primaryLR == true
2314 primaryLR = lrCount >= 0;
2316 #if 0 // for debugging
2317 printf("*** direction ***\n");
2318 printf("lrCount = %d\n", lrCount);
2319 printf("primaryLR = %d\n", primaryLR);
2322 //----- column assignment
2324 // sort blocks into xy order for column assignment
2325 blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
2326 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2329 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2331 // column assignment
// each block is compared with every block that precedes it in the
// sorted array; the comparison axis and direction depend on primaryRot
2332 for (i = 0; i < nBlocks; ++i) {
2335 for (j = 0; j < i; ++j) {
2337 col2 = 0; // make gcc happy
2338 switch (primaryRot) {
2340 if (blk0->xMin > blk1->xMax) {
2341 col2 = blk1->col + blk1->nColumns + 3;
2343 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2344 (blk1->xMax - blk1->xMin)) *
2349 if (blk0->yMin > blk1->yMax) {
2350 col2 = blk1->col + blk1->nColumns + 3;
2352 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2353 (blk1->yMax - blk1->yMin)) *
2358 if (blk0->xMax < blk1->xMin) {
2359 col2 = blk1->col + blk1->nColumns + 3;
2361 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2362 (blk1->xMin - blk1->xMax)) *
2367 if (blk0->yMax < blk1->yMin) {
2368 col2 = blk1->col + blk1->nColumns + 3;
2370 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2371 (blk1->yMin - blk1->yMax)) *
// shift every per-character column of the block's lines by col1
// (note the inclusive bound: col[] has len + 1 entries in use)
2381 for (line = blk0->lines; line; line = line->next) {
2382 for (j = 0; j <= line->len; ++j) {
2383 line->col[j] += col1;
2388 #if 0 // for debugging
2389 printf("*** blocks, after column assignment ***\n");
2390 for (blk = blkList; blk; blk = blk->next) {
2391 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2392 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2394 for (line = blk->lines; line; line = line->next) {
2396 for (word0 = line->words; word0; word0 = word0->next) {
2397 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2398 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2399 word0->base, word0->fontSize, word0->spaceAfter);
2400 for (i = 0; i < word0->len; ++i) {
2401 fputc(word0->text[i] & 0xff, stdout);
2410 //----- reading order sort
2412 // sort blocks into yx order (in preparation for reading order sort)
2413 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2415 // compute space on left and right sides of each block
2416 for (i = 0; i < nBlocks; ++i) {
2418 for (j = 0; j < nBlocks; ++j) {
2421 blk0->updatePriMinMax(blk1);
2426 #if 0 // for debugging
2427 printf("*** blocks, after yx sort ***\n");
2428 for (i = 0; i < nBlocks; ++i) {
2430 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2431 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2432 blk->priMin, blk->priMax);
2433 for (line = blk->lines; line; line = line->next) {
2435 for (word0 = line->words; word0; word0 = word0->next) {
2436 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2437 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2438 word0->base, word0->fontSize, word0->spaceAfter);
2439 for (j = 0; j < word0->len; ++j) {
2440 fputc(word0->text[j] & 0xff, stdout);
2450 //~ this needs to be adjusted for writing mode (vertical text)
2451 //~ this also needs to account for right-to-left column ordering
// blkArray is a working copy of the sorted block array
2452 blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
2453 memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2454 flows = lastFlow = NULL;
2456 nBlocksLeft = nBlocks;
2457 while (nBlocksLeft > 0) {
2459 // find the upper-left-most block
// skip over array slots that are already NULL
2460 for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2463 for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2466 if (blk && blk->secondaryDelta(blk1) > 0) {
2469 if (blk1->primaryCmp(blk) < 0) {
2479 // create a new flow, starting with the upper-left-most block
2480 flow = new TextFlow(this, blk);
2482 lastFlow->next = flow;
2487 fontSize = blk->lines->words->fontSize;
2489 // push the upper-left-most block on the stack
2490 blk->stackNext = NULL;
2493 // find the other blocks in this flow
2496 // find the upper-left-most block under (but within
2497 // maxBlockSpacing of) the top block on the stack
// the spacing limit scales with the font size of the first word of the
// stack-top block
2498 blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2501 for (j = firstBlkIdx; j < nBlocks; ++j) {
2504 if (blkStack->secondaryDelta(blk1) > blkSpace) {
2507 if (blk && blk->secondaryDelta(blk1) > 0) {
2510 if (blk1->isBelow(blkStack) &&
2511 (!blk || blk1->primaryCmp(blk) < 0)) {
2518 // if a suitable block was found, add it to the flow and push it
2520 if (blk && flow->blockFits(blk, blkStack)) {
2524 flow->addBlock(blk);
2525 fontSize = blk->lines->words->fontSize;
2526 blk->stackNext = blkStack;
2529 // otherwise (if there is no block under the top block or the
2530 // block is not suitable), pop the stack
2532 blkStack = blkStack->stackNext;
2538 #if 0 // for debugging
2539 printf("*** flows ***\n");
2540 for (flow = flows; flow; flow = flow->next) {
2541 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2542 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2543 flow->priMin, flow->priMax);
2544 for (blk = flow->blocks; blk; blk = blk->next) {
2545 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2546 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2547 blk->priMin, blk->priMax);
2548 for (line = blk->lines; line; line = line->next) {
2550 for (word0 = line->words; word0; word0 = word0->next) {
2551 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2552 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2553 word0->base, word0->fontSize, word0->spaceAfter);
2554 for (i = 0; i < word0->len; ++i) {
2555 fputc(word0->text[i] & 0xff, stdout);
// Searches the page for the len-character Unicode string s.
//
//   s, len        -- string to look for; Latin A-Z is lowercased on
//                    both sides before comparing
//   startAtTop    -- when set, no upper limit is applied to the search
//   stopAtBottom  -- when set, no lower limit is applied to the search
//   startAtLast   -- when set and a previous find exists, the search
//                    starts at (lastFindXMin, lastFindYMin)
//   stopAtLast    -- when set and a previous find exists, the search
//                    stops at (lastFindXMin, lastFindYMin)
//   xMin..yMax    -- in/out bounding box of the match
//
// Among all candidate matches between the start and stop limits, the
// one with the smallest (yMin, xMin) is kept.  A successful search
// records its position in lastFindXMin/lastFindYMin and sets
// haveLastFind.
2570 GBool TextPage::findText(Unicode *s, int len,
2571 GBool startAtTop, GBool stopAtBottom,
2572 GBool startAtLast, GBool stopAtLast,
2573 double *xMin, double *yMin,
2574 double *xMax, double *yMax) {
2580 double xStart, yStart, xStop, yStop;
2581 double xMin0, yMin0, xMax0, yMax0;
2582 double xMin1, yMin1, xMax1, yMax1;
2585 //~ needs to handle right-to-left text
2591 xStart = yStart = xStop = yStop = 0;
2592 if (startAtLast && haveLastFind) {
2593 xStart = lastFindXMin;
2594 yStart = lastFindYMin;
2595 } else if (!startAtTop) {
2599 if (stopAtLast && haveLastFind) {
2600 xStop = lastFindXMin;
2601 yStop = lastFindYMin;
2602 } else if (!stopAtBottom) {
2608 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2609 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2611 for (i = 0; i < nBlocks; ++i) {
2614 // check: is the block above the top limit?
2615 if (!startAtTop && blk->yMax < yStart) {
2619 // check: is the block below the bottom limit?
2620 if (!stopAtBottom && blk->yMin > yStop) {
2624 for (line = blk->lines; line; line = line->next) {
2626 // check: is the line above the top limit?
2627 if (!startAtTop && line->yMin < yStart) {
2631 // check: is the line below the bottom limit?
2632 if (!stopAtBottom && line->yMin > yStop) {
2636 // search each position in this line
2638 for (j = 0, p = line->text; j <= m - len; ++j, ++p) {
2640 // compare the strings
2641 for (k = 0; k < len; ++k) {
2642 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
2643 //~ extended to handle other character sets
2644 if (p[k] >= 0x41 && p[k] <= 0x5a) {
2649 if (s[k] >= 0x41 && s[k] <= 0x5a) {
// the match spans edge[j]..edge[j + len]; which pair of coordinates it
// fills, and in which order, depends on the line's rotation
2662 switch (line->rot) {
2664 xMin1 = line->edge[j];
2665 xMax1 = line->edge[j + len];
2672 yMin1 = line->edge[j];
2673 yMax1 = line->edge[j + len];
2676 xMin1 = line->edge[j + len];
2677 xMax1 = line->edge[j];
2684 yMin1 = line->edge[j + len];
2685 yMax1 = line->edge[j];
// the candidate must lie strictly after the start point and strictly
// before the stop point, ordered by y first and then by x; the x
// tie-break on the stop side compares against xStop, mirroring the
// xStart comparison on the start side
2689 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
2691 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
// keep the candidate with the smallest (yMin, xMin)
2692 if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
// remember this hit so startAtLast / stopAtLast can refer to it
2710 lastFindXMin = xMin0;
2711 lastFindYMin = yMin0;
2712 haveLastFind = gTrue;
// Builds a string from the text that falls inside the rectangle
// (xMin, yMin) - (xMax, yMax).
//
// Line fragments intersecting the rectangle are collected, assigned
// columns, sorted, and then dumped through the global text encoding
// with space padding between columns and the configured end-of-line
// sequence between lines.
2719 GString *TextPage::getText(double xMin, double yMin,
2720 double xMax, double yMax) {
2726 TextLineFrag *frags;
2727 int nFrags, fragsSize;
2729 char space[8], eol[16];
2730 int spaceLen, eolLen;
2733 int col, idx0, idx1, i, j;
2734 GBool multiLine, oneRot;
2742 // get the output encoding
2743 if (!(uMap = globalParams->getTextEncoding())) {
2746 isUnicode = uMap->isUnicode();
// pre-encode the space and end-of-line sequences in the output encoding
2747 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
2748 eolLen = 0; // make gcc happy
2749 switch (globalParams->getTextEOL()) {
2751 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
2754 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2755 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
2758 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2762 //~ writing mode (horiz/vert)
2764 // collect the line fragments that are in the rectangle
2766 frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
2770 for (i = 0; i < nBlocks; ++i) {
// cheap rejection: block bounding box, then line bounding box
2772 if (xMin < blk->xMax && blk->xMin < xMax &&
2773 yMin < blk->yMax && blk->yMin < yMax) {
2774 for (line = blk->lines; line; line = line->next) {
2775 if (xMin < line->xMax && line->xMin < xMax &&
2776 yMin < line->yMax && line->yMin < yMax) {
// per rotation: the line's midpoint across its baseline must lie inside
// the rectangle, and characters are selected by comparing the midpoint
// of edge[j]..edge[j+1] with the rectangle limits
2778 switch (line->rot) {
2780 y = 0.5 * (line->yMin + line->yMax);
2781 if (yMin < y && y < yMax) {
2783 while (j < line->len) {
2784 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2792 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2801 x = 0.5 * (line->xMin + line->xMax);
2802 if (xMin < x && x < xMax) {
2804 while (j < line->len) {
2805 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2813 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2822 y = 0.5 * (line->yMin + line->yMax);
2823 if (yMin < y && y < yMax) {
2825 while (j < line->len) {
2826 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2834 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2843 x = 0.5 * (line->xMin + line->xMax);
2844 if (xMin < x && x < xMax) {
2846 while (j < line->len) {
2847 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2855 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
// a fragment is recorded only when both end indexes were found
2864 if (idx0 >= 0 && idx1 >= 0) {
2865 if (nFrags == fragsSize) {
2867 frags = (TextLineFrag *)
2868 grealloc(frags, fragsSize * sizeof(TextLineFrag));
2870 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
// detect whether the selected lines use more than one rotation
2872 if (lastRot >= 0 && line->rot != lastRot) {
2875 lastRot = line->rot;
2882 // sort the fragments and generate the string
2885 for (i = 0; i < nFrags; ++i) {
2886 frags[i].computeCoords(oneRot);
2888 assignColumns(frags, nFrags, oneRot);
2890 // if all lines in the region have the same rotation, use it;
2891 // otherwise, use the page's primary rotation
2893 qsort(frags, nFrags, sizeof(TextLineFrag),
2894 &TextLineFrag::cmpYXLineRot);
2896 qsort(frags, nFrags, sizeof(TextLineFrag),
2897 &TextLineFrag::cmpYXPrimaryRot);
2902 for (i = 0; i < nFrags; ++i) {
// an end-of-line is emitted when the fragment's column is behind the
// running column or its baseline is too far from the previous one
2906 if (frag->col < col ||
2907 (i > 0 && fabs(frag->base - frags[i-1].base) >
2908 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
2909 s->append(eol, eolLen);
// pad with spaces up to the fragment's column
2915 for (; col < frag->col; ++col) {
2916 s->append(space, spaceLen);
2919 // get the fragment text
2920 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
2924 s->append(eol, eolLen);
// Locates the characters at positions [pos, pos + length) and
// accumulates the bounding box of all words overlapping that range.
//
//   pos, length  -- character range, compared with each word's
//                   charPos / charLen
//   xMin..yMax   -- output bounding box
//
// NOTE(review): presumably returns gTrue when at least one word
// overlapped the range -- confirm against the function's tail.
2934 GBool TextPage::findCharRange(int pos, int length,
2935 double *xMin, double *yMin,
2936 double *xMax, double *yMax) {
2940 double xMin0, xMax0, yMin0, yMax0;
2941 double xMin1, xMax1, yMin1, yMax1;
2949 //~ this doesn't correctly handle:
2950 //~ - ranges split across multiple lines (the highlighted region
2951 //~ is the bounding box of all the parts of the range)
2952 //~ - cases where characters don't convert one-to-one into Unicode
2954 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2955 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2956 for (i = 0; i < nBlocks; ++i) {
2958 for (line = blk->lines; line; line = line->next) {
2959 for (word = line->words; word; word = word->next) {
// half-open interval overlap test between the word and the request
2960 if (pos < word->charPos + word->charLen &&
2961 word->charPos < pos + length) {
// j0 / j1: first and last requested index relative to this word
2962 j0 = pos - word->charPos;
2966 j1 = pos + length - 1 - word->charPos;
2967 if (j1 >= word->len) {
// edge[j0]..edge[j1 + 1] bounds the selected characters; the rotation
// decides which coordinate pair they map to and in which order
2970 switch (line->rot) {
2972 xMin1 = word->edge[j0];
2973 xMax1 = word->edge[j1 + 1];
2980 yMin1 = word->edge[j0];
2981 yMax1 = word->edge[j1 + 1];
2984 xMin1 = word->edge[j1 + 1];
2985 xMax1 = word->edge[j0];
2992 yMin1 = word->edge[j1 + 1];
2993 yMax1 = word->edge[j0];
// grow the accumulated box; "first" forces initialization
2996 if (first || xMin1 < xMin0) {
2999 if (first || xMax1 > xMax0) {
3002 if (first || yMin1 < yMin0) {
3005 if (first || yMax1 > yMax0) {
// Writes the page's text through outputFunc(outputStream, buf, len).
//
// Three output modes are implemented, as labelled by the comments in
// the body: raw content-stream order, physical layout (fragments placed
// by column and baseline), and reading order with the layout "undone"
// (flow by flow, block by block).  Space, end-of-line and end-of-page
// sequences are pre-encoded in the global text encoding.
3023 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3029 TextLineFrag *frags;
3031 int nFrags, fragsSize;
3033 char space[8], eol[16], eop[8];
3034 int spaceLen, eolLen, eopLen;
3039 // get the output encoding
3040 if (!(uMap = globalParams->getTextEncoding())) {
3043 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3044 eolLen = 0; // make gcc happy
3045 switch (globalParams->getTextEOL()) {
3047 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3050 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3051 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3054 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
// end-of-page marker is U+000C in the output encoding
3057 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3058 pageBreaks = globalParams->getTextPageBreaks();
3060 //~ writing mode (horiz/vert)
3062 // output the page in raw (content stream) order
3065 for (word = rawWords; word; word = word->next) {
3067 dumpFragment(word->text, word->len, uMap, s);
3068 (*outputFunc)(outputStream, s->getCString(), s->getLength());
// next word on (nearly) the same baseline: emit a space only if the gap
// exceeds minWordSpacing; otherwise an end-of-line is emitted
3071 fabs(word->next->base - word->base) <
3072 maxIntraLineDelta * word->fontSize) {
3073 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3074 (*outputFunc)(outputStream, space, spaceLen);
3077 (*outputFunc)(outputStream, eol, eolLen);
3081 // output the page, maintaining the original physical layout
3082 } else if (physLayout) {
3084 // collect the line fragments for the page and sort them
3086 frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
3088 for (i = 0; i < nBlocks; ++i) {
3090 for (line = blk->lines; line; line = line->next) {
3091 if (nFrags == fragsSize) {
3093 frags = (TextLineFrag *)grealloc(frags,
3094 fragsSize * sizeof(TextLineFrag));
// one fragment per whole line
3096 frags[nFrags].init(line, 0, line->len);
3097 frags[nFrags].computeCoords(gTrue);
3101 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3105 for (i = 0; i < nFrags; ++i) {
// pad with spaces up to the fragment's column
3109 for (; col < frag->col; ++col) {
3110 (*outputFunc)(outputStream, space, spaceLen);
3115 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3116 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3119 // print one or more returns if necessary
3120 if (i == nFrags - 1 ||
3121 frags[i+1].col < col ||
3122 fabs(frags[i+1].base - frag->base) >
3123 maxIntraLineDelta * frag->line->words->fontSize) {
// d = baseline distance to the next fragment in units of the current
// fragment's first-word font size
3124 if (i < nFrags - 1) {
3125 d = (int)((frags[i+1].base - frag->base) /
3126 frag->line->words->fontSize);
3135 for (; d > 0; --d) {
3136 (*outputFunc)(outputStream, eol, eolLen);
3144 // output the page, "undoing" the layout
3146 for (flow = flows; flow; flow = flow->next) {
3147 for (blk = flow->blocks; blk; blk = blk->next) {
3148 for (line = blk->lines; line; line = line->next) {
// hyphenated lines get special handling when more text follows in the
// block or flow
3150 if (line->hyphenated && (line->next || blk->next)) {
3154 dumpFragment(line->text, n, uMap, s);
3155 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3157 if (!line->hyphenated) {
3159 (*outputFunc)(outputStream, space, spaceLen);
3160 } else if (blk->next) {
3161 //~ this is a bit of a kludge - we should really do a more
3162 //~ intelligent determination of paragraphs
3163 if (blk->next->lines->words->fontSize ==
3164 blk->lines->words->fontSize) {
3165 (*outputFunc)(outputStream, space, spaceLen);
3167 (*outputFunc)(outputStream, eol, eolLen);
3173 (*outputFunc)(outputStream, eol, eolLen);
3174 (*outputFunc)(outputStream, eol, eolLen);
3180 (*outputFunc)(outputStream, eop, eopLen);
3181 (*outputFunc)(outputStream, eol, eolLen);
// Assigns a column number (frag->col) to each of the nFrags fragments.
//
// Two strategies exist, as described by the comments in the body:
//   - uniform rotation: fragments are sorted with cmpXYLineRot and each
//     one is positioned relative to the fragments before it;
//   - mixed rotations: the existing column numbers are kept and shifted
//     so that the smallest becomes 0.
//
// NOTE(review): frags[0] is read on both paths; callers presumably
// guarantee nFrags > 0 or a guard sits on a line not shown -- confirm.
3187 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3188 TextLineFrag *frag0, *frag1;
3189 int rot, col1, col2, i, j, k;
3191 // all text in the region has the same rotation -- recompute the
3192 // column numbers based only on the text in the region
3194 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3195 rot = frags[0].line->rot;
3196 for (i = 0; i < nFrags; ++i) {
3199 for (j = 0; j < i; ++j) {
3201 col2 = 0; // make gcc happy
// four variants follow, one per rotation.  If frag0 lies entirely past
// frag1, it goes one column beyond frag1's column span; otherwise the
// loop over k advances while frag0's leading edge is past the midpoint
// of character k in frag1.
3204 if (frag0->xMin >= frag1->xMax) {
3205 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3206 frag1->line->col[frag1->start]) + 1;
3208 for (k = frag1->start;
3209 k < frag1->start + frag1->len &&
3210 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3211 frag1->line->edge[k+1]);
3214 frag1->line->col[k] - frag1->line->col[frag1->start];
3218 if (frag0->yMin >= frag1->yMax) {
3219 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3220 frag1->line->col[frag1->start]) + 1;
3222 for (k = frag1->start;
3223 k < frag1->start + frag1->len &&
3224 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3225 frag1->line->edge[k+1]);
3228 frag1->line->col[k] - frag1->line->col[frag1->start];
3232 if (frag0->xMax <= frag1->xMin) {
3233 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3234 frag1->line->col[frag1->start]) + 1;
3236 for (k = frag1->start;
3237 k < frag1->start + frag1->len &&
3238 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3239 frag1->line->edge[k+1]);
3242 frag1->line->col[k] - frag1->line->col[frag1->start];
3246 if (frag0->yMax <= frag1->yMin) {
3247 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3248 frag1->line->col[frag1->start]) + 1;
3250 for (k = frag1->start;
3251 k < frag1->start + frag1->len &&
3252 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3253 frag1->line->edge[k+1]);
3256 frag1->line->col[k] - frag1->line->col[frag1->start];
3267 // the region includes text at different rotations -- use the
3268 // globally assigned column numbers, offset by the minimum column
3269 // number (i.e., shift everything over to column 0)
// col1 = minimum column over all fragments
3271 col1 = frags[0].col;
3272 for (i = 1; i < nFrags; ++i) {
3273 if (frags[i].col < col1) {
3274 col1 = frags[i].col;
3277 for (i = 0; i < nFrags; ++i) {
3278 frags[i].col -= col1;
// Converts len Unicode characters from text with uMap and appends the
// encoded bytes to s.
//
// When the output map is Unicode, runs are emitted in alternating
// left-to-right / right-to-left sections, and sections that run against
// the surrounding direction are bracketed with the encoded forms of
// U+202A (lre) or U+202B (rle) and U+202C (popdf).  For non-Unicode
// maps the characters are simply converted in order.
//
// The int result is added by callers to their running column counter.
3283 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3285 char lre[8], rle[8], popdf[8], buf[8];
3286 int lreLen, rleLen, popdfLen, n;
3291 if (uMap->isUnicode()) {
// pre-encode the three directional control characters
3293 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3294 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3295 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3301 // output a left-to-right section
// the section extends up to (not including) the next unicodeTypeR char
3302 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3303 for (k = i; k < j; ++k) {
3304 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3309 // output a right-to-left section
3310 for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3312 s->append(rle, rleLen);
// characters of the section are emitted from last to first
3313 for (k = j - 1; k >= i; --k) {
3314 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3318 s->append(popdf, popdfLen);
3325 s->append(rle, rleLen);
3328 // output a right-to-left section
// this half scans the text from the end toward index 0
3329 for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3330 for (k = i; k > j; --k) {
3331 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3336 // output a left-to-right section
3337 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3339 s->append(lre, lreLen);
3340 for (k = j + 1; k <= i; ++k) {
3341 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3345 s->append(popdf, popdfLen);
3349 s->append(popdf, popdfLen);
// non-Unicode output map: straight in-order conversion
3354 for (i = 0; i < len; ++i) {
3355 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3364 #if TEXTOUT_WORD_LIST
// Creates a new TextWordList for this page (only compiled when
// TEXTOUT_WORD_LIST is enabled).  The list is allocated with new and
// returned as a raw pointer; physLayout is forwarded to the
// TextWordList constructor.
3365 TextWordList *TextPage::makeWordList(GBool physLayout) {
3366 return new TextWordList(this, physLayout);
3370 //------------------------------------------------------------------------
3372 //------------------------------------------------------------------------
// Output callback that writes len bytes of text to stream, which is
// treated as a FILE *.  The fwrite result is not checked.
3374 static void outputToFile(void *stream, char *text, int len) {
3375 fwrite(text, 1, len, (FILE *)stream);
// Constructs a text output device that writes to a file.
//
//   fileName     -- output path; "-" selects stdout
//   physLayoutA  -- stored in physLayout
//   rawOrderA    -- stored in rawOrder and passed to the TextPage
//   append       -- open the file in "ab" mode instead of "wb"
//
// If the file cannot be opened an error is reported.  Output goes
// through outputToFile.
3378 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3379 GBool rawOrderA, GBool append) {
3381 physLayout = physLayoutA;
3382 rawOrder = rawOrderA;
3388 if (!strcmp(fileName, "-")) {
3389 outputStream = stdout;
3391 // keep DOS from munging the end-of-line characters
3392 setmode(fileno(stdout), O_BINARY);
3394 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3397 error(-1, "Couldn't open text file '%s'", fileName);
3401 outputFunc = &outputToFile;
3403 outputStream = NULL;
3406 // set up text object
3407 text = new TextPage(rawOrderA);
// Creates a text output device that hands its output to a
// caller-supplied callback instead of a file.
//   func        - output callback (its assignment is not visible in
//                 this listing; presumably stored in outputFunc)
//   stream      - opaque pointer stored in outputStream
//   physLayoutA - stored in physLayout
//   rawOrderA   - stored in rawOrder and passed to the TextPage
3410 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3411 GBool physLayoutA, GBool rawOrderA) {
3413 outputStream = stream;
3415 physLayout = physLayoutA;
3416 rawOrder = rawOrderA;
3417 text = new TextPage(rawOrderA);
// Destructor.  Treats outputStream as a FILE *: passes its handle to
// ICS_MapRefNumAndAssign() (declared in ICSupport.h, which is included
// for setting the type/creator of MacOS files) and closes it with
// fclose().
// NOTE(review): the conditions guarding these two calls (presumably a
// "needs closing" flag and a MacOS-only #ifdef) and the release of the
// TextPage are not visible in this listing.
3421 TextOutputDev::~TextOutputDev() {
3424 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3426 fclose((FILE *)outputStream);
// Begins a new page by forwarding <state> to the TextPage.  <pageNum>
// is not used by the visible code.
3433 void TextOutputDev::startPage(int pageNum, GfxState *state) {
3434 text->startPage(state);
// Finishes the current page: coalesces the collected text according to
// physLayout, then dumps the page through outputFunc/outputStream.
// NOTE(review): a line between the two calls is missing from this
// listing -- the dump may be conditional (e.g. on outputStream being
// non-NULL); confirm against the full source.
3437 void TextOutputDev::endPage() {
3438 text->coalesce(physLayout);
3440 text->dump(outputStream, outputFunc, physLayout);
// Forwards a font change in <state> to the TextPage.
3444 void TextOutputDev::updateFont(GfxState *state) {
3445 text->updateFont(state);
// Called at the start of a string: begins a new word in the TextPage at
// the current position reported by <state>.  The string <s> itself is
// not used by the visible code.
3448 void TextOutputDev::beginString(GfxState *state, GString *s) {
3449 text->beginWord(state, state->getCurX(), state->getCurY());
// Called at the end of a string.
// NOTE(review): the body is not visible in this listing; presumably it
// closes the word opened by beginString() -- confirm against the full
// source.
3452 void TextOutputDev::endString(GfxState *state) {
// Adds one character to the TextPage.  The position (<x>, <y>), the
// advance (<dx>, <dy>), the character code <c> and its Unicode mapping
// (<u>, <uLen>) are forwarded unchanged; <originX> and <originY> are
// not passed on.
3456 void TextOutputDev::drawChar(GfxState *state, double x, double y,
3457 double dx, double dy,
3458 double originX, double originY,
3459 CharCode c, Unicode *u, int uLen) {
3460 text->addChar(state, x, y, dx, dy, c, u, uLen);
// Searches the current page for the Unicode string <s> of length <len>.
// This is a thin wrapper: every argument is forwarded unchanged to
// TextPage::findText(), and its result is returned as-is.
3463 GBool TextOutputDev::findText(Unicode *s, int len,
3464 GBool startAtTop, GBool stopAtBottom,
3465 GBool startAtLast, GBool stopAtLast,
3466 double *xMin, double *yMin,
3467 double *xMax, double *yMax) {
3468 return text->findText(s, len, startAtTop, stopAtBottom,
3469 startAtLast, stopAtLast, xMin, yMin, xMax, yMax);
// Returns the text for the rectangle (<xMin>, <yMin>)-(<xMax>, <yMax>)
// by forwarding to TextPage::getText().  Ownership of the returned
// GString is whatever TextPage::getText() defines (not visible here).
3472 GString *TextOutputDev::getText(double xMin, double yMin,
3473 double xMax, double yMax) {
3474 return text->getText(xMin, yMin, xMax, yMax);
// Looks up the character range starting at <pos> with <length>
// characters by forwarding to TextPage::findCharRange(); the bounding
// box out-parameters and the GBool result come straight from that call.
3477 GBool TextOutputDev::findCharRange(int pos, int length,
3478 double *xMin, double *yMin,
3479 double *xMax, double *yMax) {
3480 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
3483 #if TEXTOUT_WORD_LIST
3484 TextWordList *TextOutputDev::makeWordList() {
3485 return text->makeWordList(physLayout);