1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
27 #include "xpdfconfig.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
33 #include "TextOutputDev.h"
36 // needed for setting type/creator of MacOS files
37 #include "ICSupport.h"
40 //------------------------------------------------------------------------
42 //------------------------------------------------------------------------
44 // Each bucket in a text pool includes baselines within a range of
// textPoolStep baseline units (TextPool::getBaseIdx divides the baseline
// coordinate by this value to select a bucket).
46 #define textPoolStep 4
48 // Inter-character space width which will cause addChar to start a new
// word; it is compared after scaling by the current word's font size.
50 #define minWordBreakSpace 0.1
52 // Negative inter-character space width, i.e., overlap, which will
53 // cause addChar to start a new word.
54 #define minDupBreakOverlap 0.2
56 // Max distance between baselines of two lines within a block, as a
57 // fraction of the font size.
58 #define maxLineSpacingDelta 1.5
60 // Max difference in primary font sizes on two lines in the same
61 // block. Delta1 is used when examining new lines above and below the
62 // current block; delta2 is used when examining text that overlaps the
63 // current block; delta3 is used when examining text to the left and
64 // right of the current block.
65 #define maxBlockFontSizeDelta1 0.05
66 #define maxBlockFontSizeDelta2 0.6
67 #define maxBlockFontSizeDelta3 0.2
69 // Max difference in font sizes inside a word.
70 #define maxWordFontSizeDelta 0.05
72 // Maximum distance between baselines of two words on the same line,
73 // e.g., distance between subscript or superscript and the primary
74 // baseline, as a fraction of the font size.
75 #define maxIntraLineDelta 0.5
77 // Minimum inter-word spacing, as a fraction of the font size. (Only
78 // used for raw ordering.)
79 #define minWordSpacing 0.15
81 // Maximum inter-word spacing, as a fraction of the font size.
82 #define maxWordSpacing 1.5
84 // Maximum horizontal spacing which will allow a word to be pulled
// NOTE(review): the sentence above is cut off and no use of
// minColSpacing1 is visible in this part of the file -- confirm the
// intended wording (presumably "... pulled into a block").
86 #define minColSpacing1 0.3
88 // Minimum spacing between columns, as a fraction of the font size.
89 #define minColSpacing2 1.0
91 // Maximum vertical spacing between blocks within a flow, as a
92 // multiple of the font size.
93 #define maxBlockSpacing 2.5
95 // Minimum spacing between characters within a word, as a fraction of
// the font size (it is multiplied by the font size where it is used in
// TextBlock::coalesce).
97 #define minCharSpacing -0.2
99 // Maximum spacing between characters within a word, as a fraction of
100 // the font size, when there is no obvious extra-wide character
// spacing.
102 #define maxCharSpacing 0.03
104 // When extra-wide character spacing is detected, the inter-character
105 // space threshold is set to the minimum inter-character space
106 // multiplied by this constant.
107 #define maxWideCharSpacingMul 1.3
109 // Max difference in primary,secondary coordinates (as a fraction of
110 // the font size) allowed for duplicated text (fake boldface, drop
111 // shadows) which is to be discarded.
112 #define dupMaxPriDelta 0.1
113 #define dupMaxSecDelta 0.2
115 //------------------------------------------------------------------------
117 //------------------------------------------------------------------------
// Creates the font-info record for the font currently selected in
// `state`.  When TEXTOUT_WORD_LIST is enabled, fontName is taken from a
// copy of the font's original name if both the font and that name are
// non-null (the alternative branch of the conditional is not visible in
// this excerpt).
119 TextFontInfo::TextFontInfo(GfxState *state) {
120 gfxFont = state->getFont();
121 #if TEXTOUT_WORD_LIST
122 fontName = (gfxFont && gfxFont->getOrigName())
123 ? gfxFont->getOrigName()->copy()
// Destructor.  Whatever cleanup it performs is compiled only when
// TEXTOUT_WORD_LIST is enabled (the statements themselves are not
// visible in this excerpt).
128 TextFontInfo::~TextFontInfo() {
129 #if TEXTOUT_WORD_LIST
// Returns true when the font currently set in `state` is the very same
// GfxFont object (pointer comparison) that this record was created for.
136 GBool TextFontInfo::matches(GfxState *state) {
137 return state->getFont() == gfxFont;
140 //------------------------------------------------------------------------
142 //------------------------------------------------------------------------
// Constructs a word for the given rotation, font and font size.
// The start point (x0, y0) is transformed through `state` into (x, y),
// and ascent/descent are derived from the font metrics scaled by
// fontSize.  When the font-info record has no GfxFont, fixed fallback
// ratios (0.95 and -0.35 of the font size) are used instead.
// With TEXTOUT_WORD_LIST enabled, the word colour is read from the
// graphics state: the stroke colour when (render mode & 3) == 1, and the
// fill colour otherwise (presumably the else branch -- the `else` line
// is not visible in this excerpt).
144 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
145 int charPosA, TextFontInfo *fontA, double fontSizeA) {
147 double x, y, ascent, descent;
153 fontSize = fontSizeA;
154 state->transform(x0, y0, &x, &y);
155 if ((gfxFont = font->gfxFont)) {
156 ascent = gfxFont->getAscent() * fontSize;
157 descent = gfxFont->getDescent() * fontSize;
159 // this means that the PDF file draws text without a current font,
160 // which should never happen
161 ascent = 0.95 * fontSize;
162 descent = -0.35 * fontSize;
169 // this is a sanity check for a case that shouldn't happen -- but
170 // if it does happen, we want to avoid dividing by zero later
180 // this is a sanity check for a case that shouldn't happen -- but
181 // if it does happen, we want to avoid dividing by zero later
191 // this is a sanity check for a case that shouldn't happen -- but
192 // if it does happen, we want to avoid dividing by zero later
202 // this is a sanity check for a case that shouldn't happen -- but
203 // if it does happen, we want to avoid dividing by zero later
216 #if TEXTOUT_WORD_LIST
219 if ((state->getRender() & 3) == 1) {
220 state->getStrokeRGB(&rgb);
222 state->getFillRGB(&rgb);
230 TextWord::~TextWord() {
// Appends one character to the word.
// The text array is (re)allocated for `size` Unicode values and the edge
// array for size + 1 doubles, i.e. one more edge than characters.
// Each of the four visible assignments stores the character's far edge
// both in the word's bounding box and in edge[len+1]; presumably one
// assignment per rotation (the selecting switch is not visible here).
235 void TextWord::addChar(GfxState *state, double x, double y,
236 double dx, double dy, Unicode u) {
239 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
240 edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
249 xMax = edge[len+1] = x + dx;
256 yMax = edge[len+1] = y + dy;
263 xMin = edge[len+1] = x + dx;
270 yMin = edge[len+1] = y + dy;
// Appends the contents of `word` to this word.
// The bounding box is compared against word's on all four sides
// (the assignments inside the ifs are not visible in this excerpt), the
// text/edge arrays are grown when len + word->len exceeds the current
// capacity, and word's characters and edges are copied in after the
// existing ones.  The final copy transfers word's trailing edge, since
// the edge array holds one more entry than the text array.  charLen is
// increased by word's charLen.
276 void TextWord::merge(TextWord *word) {
279 if (word->xMin < xMin) {
282 if (word->yMin < yMin) {
285 if (word->xMax > xMax) {
288 if (word->yMax > yMax) {
291 if (len + word->len > size) {
292 size = len + word->len;
293 text = (Unicode *)grealloc(text, size * sizeof(Unicode));
294 edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
296 for (i = 0; i < word->len; ++i) {
297 text[len + i] = word->text[i];
298 edge[len + i] = word->edge[i];
300 edge[len + word->len] = word->edge[word->len];
302 charLen += word->charLen;
// Three-way comparison of this word against `word`; returns -1, 0 or 1.
// One of four differences is used: xMin or yMin (this minus word), or
// xMax or yMax (word minus this).  Presumably the choice follows the
// word's rotation -- the switch is not visible in this excerpt.
305 inline int TextWord::primaryCmp(TextWord *word) {
308 cmp = 0; // make gcc happy
311 cmp = xMin - word->xMin;
314 cmp = yMin - word->yMin;
317 cmp = word->xMax - xMax;
320 cmp = word->yMax - yMax;
323 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Computes the gap between this word and `word`: the other word's near
// edge minus this word's far edge, in one of four orientations
// (presumably selected by rotation; the switch and the return statement
// are not visible in this excerpt).  A negative value means overlap.
326 double TextWord::primaryDelta(TextWord *word) {
329 delta = 0; // make gcc happy
332 delta = word->xMin - xMax;
335 delta = word->yMin - yMax;
338 delta = xMin - word->xMax;
341 delta = yMin - word->yMax;
// qsort-style comparator.  p1 and p2 point at array elements of type
// TextWord*, so each is dereferenced once to reach the word.
// Compares yMin, then xMin (presumably only when the yMin values are
// equal -- the guarding condition is not visible in this excerpt).
// Returns -1, 0 or 1.
347 int TextWord::cmpYX(const void *p1, const void *p2) {
348 TextWord *word1 = *(TextWord **)p1;
349 TextWord *word2 = *(TextWord **)p2;
352 cmp = word1->yMin - word2->yMin;
354 cmp = word1->xMin - word2->xMin;
356 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
359 #if TEXTOUT_WORD_LIST
// Returns the word's text as a GString.
// The UnicodeMap is obtained from globalParams->getTextEncoding(); the
// handling of a missing map is not visible in this excerpt.  Each
// Unicode character of the word is converted with mapUnicode into a
// local buffer (how the n converted bytes are accumulated is not
// visible here).
361 GString *TextWord::getText() {
368 if (!(uMap = globalParams->getTextEncoding())) {
371 for (i = 0; i < len; ++i) {
372 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
379 #endif // TEXTOUT_WORD_LIST
381 //------------------------------------------------------------------------
383 //------------------------------------------------------------------------
385 TextPool::TextPool() {
// Destructor.  Visits every bucket from minBaseIdx to maxBaseIdx and
// walks the word list stored in each one; word2 carries the successor so
// the walk can continue past the current word (presumably because the
// current word is deleted -- the loop body is not visible here).
393 TextPool::~TextPool() {
395 TextWord *word, *word2;
397 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
398 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
// Maps a baseline coordinate to a bucket index: base / textPoolStep,
// truncated to int.  The result is then tested against minBaseIdx and
// maxBaseIdx (presumably clamped into that range -- the bodies of the
// two ifs are not visible in this excerpt).
406 int TextPool::getBaseIdx(double base) {
409 baseIdx = (int)(base / textPoolStep);
410 if (baseIdx < minBaseIdx) {
413 if (baseIdx > maxBaseIdx) {
// Inserts `word` into the bucket matching its baseline.
// Step 1 grows the bucket array if the word's index falls outside
// [minBaseIdx, maxBaseIdx]:
//   - empty pool (minBaseIdx > maxBaseIdx): allocate a window of 128
//     buckets on either side of the word's index, all NULL;
//   - index below the range: allocate a larger array, NULL the new low
//     buckets and memcpy the existing ones after them;
//   - index above the range: grealloc in place and NULL the new high
//     buckets.
// Step 2 links the word into the bucket's list, using primaryCmp to find
// its position.  `cursor`/`cursorBaseIdx` refer to a previously recorded
// word and bucket; when the new word belongs to the same bucket and
// compares after the cursor, the first branch is taken (presumably to
// start the search at the cursor -- those lines are not visible here).
419 void TextPool::addWord(TextWord *word) {
421 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
424 // expand the array if needed
425 wordBaseIdx = (int)(word->base / textPoolStep);
426 if (minBaseIdx > maxBaseIdx) {
427 minBaseIdx = wordBaseIdx - 128;
428 maxBaseIdx = wordBaseIdx + 128;
429 pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) *
431 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
432 pool[baseIdx - minBaseIdx] = NULL;
434 } else if (wordBaseIdx < minBaseIdx) {
435 newMinBaseIdx = wordBaseIdx - 128;
436 newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) *
438 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
439 newPool[baseIdx - newMinBaseIdx] = NULL;
441 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
442 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
445 minBaseIdx = newMinBaseIdx;
446 } else if (wordBaseIdx > maxBaseIdx) {
447 newMaxBaseIdx = wordBaseIdx + 128;
448 pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) *
450 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
451 pool[baseIdx - minBaseIdx] = NULL;
453 maxBaseIdx = newMaxBaseIdx;
456 // insert the new word
457 if (cursor && wordBaseIdx == cursorBaseIdx &&
458 word->primaryCmp(cursor) > 0) {
463 w1 = pool[wordBaseIdx - minBaseIdx];
// advance until w1 is the first word that does not sort before `word`
465 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
470 pool[wordBaseIdx - minBaseIdx] = word;
473 cursorBaseIdx = wordBaseIdx;
476 //------------------------------------------------------------------------
478 //------------------------------------------------------------------------
// Constructs an empty line belonging to block blkA, with rotation rotA
// and baseline baseA.  The word list starts out empty (words and
// lastWord are both NULL); other initialisation is not visible in this
// excerpt.
480 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
486 words = lastWord = NULL;
496 TextLine::~TextLine() {
// Appends `word` to the line's word list (linked through lastWord->next)
// and compares the word's bounding box with the line's on all four
// sides (presumably extending the line's box -- the assignments inside
// the ifs, and the handling of the first word, are not visible here).
509 void TextLine::addWord(TextWord *word) {
511 lastWord->next = word;
523 if (word->xMin < xMin) {
526 if (word->xMax > xMax) {
529 if (word->yMin < yMin) {
532 if (word->yMax > yMax) {
// Computes the gap between this line and `line`: the other line's near
// edge minus this line's far edge, in one of four orientations
// (presumably selected by rotation; the switch and return statement are
// not visible in this excerpt).
538 double TextLine::primaryDelta(TextLine *line) {
541 delta = 0; // make gcc happy
544 delta = line->xMin - xMax;
547 delta = line->yMin - yMax;
550 delta = xMin - line->xMax;
553 delta = yMin - line->yMax;
// Three-way comparison of this line's starting edge against that of
// `line`; returns -1, 0 or 1.  One of four differences is used (xMin,
// yMin, reversed xMax, reversed yMax), presumably chosen by rotation --
// the switch is not visible in this excerpt.
559 int TextLine::primaryCmp(TextLine *line) {
562 cmp = 0; // make gcc happy
565 cmp = xMin - line->xMin;
568 cmp = yMin - line->yMin;
571 cmp = line->xMax - xMax;
574 cmp = line->yMax - yMax;
577 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Three-way comparison of baselines; returns -1, 0 or 1.
// For rotations 0 and 3 a smaller base sorts first; for rotations 1 and
// 2 the difference is reversed, so a larger base sorts first.
580 int TextLine::secondaryCmp(TextLine *line) {
583 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
584 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Orders lines by baseline first (secondaryCmp) and falls back to
// primaryCmp.  A non-zero secondary result enters the if (presumably
// returning it directly -- that line is not visible in this excerpt).
587 int TextLine::cmpYX(TextLine *line) {
590 if ((cmp = secondaryCmp(line))) {
593 return primaryCmp(line);
// qsort-style comparator.  p1 and p2 point at array elements of type
// TextLine*, so each is dereferenced once to reach the line.
// Orders by primaryCmp first and falls back to secondaryCmp; a non-zero
// primary result enters the if (presumably returning it directly --
// that line is not visible in this excerpt).
596 int TextLine::cmpXY(const void *p1, const void *p2) {
597 TextLine *line1 = *(TextLine **)p1;
598 TextLine *line2 = *(TextLine **)p2;
601 if ((cmp = line1->primaryCmp(line2))) {
604 return line1->secondaryCmp(line2);
// Merges the line's words and builds its flattened text.
// Visible phases:
//   1. Derive `space`, the gap at or above which two adjacent words are
//      treated as separate words.  minSpace tracks the smallest gap seen
//      between adjacent words; `space` is either maxCharSpacing times
//      the first word's font size or maxWideCharSpacingMul times
//      minSpace (the condition choosing between them is not visible).
//   2. Walk adjacent word pairs: a gap >= space sets spaceAfter on the
//      first word; otherwise, when both words share the same font
//      object, have nearly equal font sizes and contiguous character
//      positions, the second word is unlinked from the list (presumably
//      after being merged into the first).
//   3. Build text[] and edge[] from the surviving words, writing a
//      U+0020 after each word flagged spaceAfter.
//   4. Build col[], recording convertedLen before each character, with
//      the total stored in col[len].
//   5. Set `hyphenated` when the last character is '-'.
// uMap may be NULL as far as isUnicode is concerned (it is tested
// there), but it is dereferenced in the col[] loop; whether that use is
// guarded is not visible in this excerpt.
607 void TextLine::coalesce(UnicodeMap *uMap) {
608 TextWord *word0, *word1;
609 double space, delta, minSpace;
616 // compute the inter-word space threshold
// NOTE(review): words->next is dereferenced on the next line; this needs
// the line to hold at least two words -- presumably ensured by a guard
// that is not visible in this excerpt; confirm.
617 if (words->len > 1 || words->next->len > 1) {
620 minSpace = words->primaryDelta(words->next);
621 for (word0 = words->next, word1 = word0->next;
622 word1 && minSpace > 0;
623 word0 = word1, word1 = word0->next) {
624 if (word1->len > 1) {
627 delta = word0->primaryDelta(word1);
628 if (delta < minSpace) {
634 space = maxCharSpacing * words->fontSize;
636 space = maxWideCharSpacingMul * minSpace;
643 if (word0->primaryDelta(word1) >= space) {
644 word0->spaceAfter = gTrue;
647 } else if (word0->font == word1->font &&
648 fabs(word0->fontSize - word1->fontSize) <
649 maxWordFontSizeDelta * words->fontSize &&
650 word1->charPos == word0->charPos + word0->charLen) {
652 word0->next = word1->next;
662 // build the line text
663 isUnicode = uMap ? uMap->isUnicode() : gFalse;
665 for (word1 = words; word1; word1 = word1->next) {
667 if (word1->spaceAfter) {
671 text = (Unicode *)gmalloc(len * sizeof(Unicode));
672 edge = (double *)gmalloc((len + 1) * sizeof(double));
674 for (word1 = words; word1; word1 = word1->next) {
675 for (j = 0; j < word1->len; ++j) {
676 text[i] = word1->text[j];
677 edge[i] = word1->edge[j];
680 edge[i] = word1->edge[word1->len];
681 if (word1->spaceAfter) {
682 text[i] = (Unicode)0x0020;
687 // compute convertedLen and set up the col array
688 col = (int *)gmalloc((len + 1) * sizeof(int));
690 for (i = 0; i < len; ++i) {
691 col[i] = convertedLen;
695 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
698 col[len] = convertedLen;
700 // check for hyphen at end of line
701 //~ need to check for other chars used as hyphens
// NOTE(review): text[len - 1] assumes len >= 1, i.e. a non-empty line.
702 hyphenated = text[len - 1] == (Unicode)'-';
705 //------------------------------------------------------------------------
707 //------------------------------------------------------------------------
// Members of TextLineFrag, a fragment (sub-range of characters) of a
// TextLine.  The enclosing class header is not visible in this excerpt.
712 TextLine *line; // the line object
713 int start, len; // offset and length of this fragment
714 // (in Unicode chars)
715 double xMin, xMax; // bounding box coordinates
717 double base; // baseline virtual coordinate
718 int col; // first column
// init() binds the fragment to a line and character range;
// computeCoords() fills in the bounding box and base.
720 void init(TextLine *lineA, int startA, int lenA);
721 void computeCoords(GBool oneRot);
// qsort-style comparators; they receive pointers to TextLineFrag
// objects themselves (not pointers to pointers).
723 static int cmpYXPrimaryRot(const void *p1, const void *p2);
724 static int cmpYXLineRot(const void *p1, const void *p2);
725 static int cmpXYLineRot(const void *p1, const void *p2);
// Binds this fragment to lenA characters of lineA starting at startA.
// The fragment's first column is read from the line's col[] array at
// the start offset (the assignments of line/start/len are not visible
// in this excerpt).
728 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
732 col = line->col[start];
// Computes the fragment's bounding box (xMin/xMax/yMin/yMax) and base.
// Visible structure:
//   - A first group of assignments takes the extent along the text
//     direction straight from the line's edge[] array (edge[start] and
//     edge[start + len]), with min/max swapped in two of the four cases
//     (presumably per line rotation, used when oneRot is set -- the
//     selecting statements are not visible in this excerpt).
//   - A special case for line rotation 0 on a page whose primary
//     rotation is 0 also reads the edges directly.
//   - Otherwise d0..d4 are normalised to fractions of the enclosing
//     block's width/height, and then mapped back into page coordinates
//     according to the page's primary rotation.
// NOTE(review): the normalisation divides by (blk->xMax - blk->xMin) and
// (blk->yMax - blk->yMin); no guard against a zero-size block is visible
// in this excerpt.
735 void TextLineFrag::computeCoords(GBool oneRot) {
737 double d0, d1, d2, d3, d4;
743 xMin = line->edge[start];
744 xMax = line->edge[start + len];
751 yMin = line->edge[start];
752 yMax = line->edge[start + len];
755 xMin = line->edge[start + len];
756 xMax = line->edge[start];
763 yMin = line->edge[start + len];
764 yMax = line->edge[start];
771 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
773 xMin = line->edge[start];
774 xMax = line->edge[start + len];
782 d0 = line->edge[start];
783 d1 = line->edge[start + len];
784 d2 = d3 = d4 = 0; // make gcc happy
791 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
792 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
793 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
794 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
795 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
801 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
802 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
803 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
804 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
805 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
811 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
812 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
813 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
814 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
815 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
821 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
822 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
823 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
824 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
825 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
829 switch (line->blk->page->primaryRot) {
831 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
832 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
833 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
834 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
// NOTE(review): this case scales `base` itself, whereas the other three
// cases below compute base from the normalised d4 -- looks like it
// should be d4 here as well; confirm before changing.
835 base = blk->yMin + base * (blk->yMax - blk->yMin);
838 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
839 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
840 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
841 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
842 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
845 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
846 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
847 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
848 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
849 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
852 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
853 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
854 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
855 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
856 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
// qsort-style comparator for TextLineFrag objects (p1/p2 point directly
// at the fragments).  The sort keys depend on the primary rotation of
// the page that frag1's line belongs to; in each case a first key is
// compared and a second key breaks ties.  Returns -1, 0 or 1.
864 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
865 TextLineFrag *frag1 = (TextLineFrag *)p1;
866 TextLineFrag *frag2 = (TextLineFrag *)p2;
869 cmp = 0; // make gcc happy
870 switch (frag1->line->blk->page->primaryRot) {
872 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
873 cmp = frag1->xMin - frag2->xMin;
877 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
878 cmp = frag1->yMin - frag2->yMin;
882 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
883 cmp = frag2->xMax - frag1->xMax;
887 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
888 cmp = frag2->yMax - frag1->yMax;
892 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator for TextLineFrag objects (p1/p2 point directly
// at the fragments).  Uses the same key pairs as cmpYXPrimaryRot, but
// the case is chosen by the rotation of frag1's own line rather than by
// the page's primary rotation.  Returns -1, 0 or 1.
895 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
896 TextLineFrag *frag1 = (TextLineFrag *)p1;
897 TextLineFrag *frag2 = (TextLineFrag *)p2;
900 cmp = 0; // make gcc happy
901 switch (frag1->line->rot) {
903 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
904 cmp = frag1->xMin - frag2->xMin;
908 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
909 cmp = frag1->yMin - frag2->yMin;
913 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
914 cmp = frag2->xMax - frag1->xMax;
918 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
919 cmp = frag2->yMax - frag1->yMax;
923 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator for TextLineFrag objects (p1/p2 point directly
// at the fragments).  The case is chosen by the rotation of frag1's
// line; compared with cmpYXLineRot the roles of the x and y keys are
// exchanged, so the x-direction key is the major key in the first case.
// Returns -1, 0 or 1.
926 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
927 TextLineFrag *frag1 = (TextLineFrag *)p1;
928 TextLineFrag *frag2 = (TextLineFrag *)p2;
931 cmp = 0; // make gcc happy
932 switch (frag1->line->rot) {
934 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
935 cmp = frag1->yMin - frag2->yMin;
939 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
940 cmp = frag2->xMax - frag1->xMax;
944 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
945 cmp = frag2->yMin - frag1->yMin;
949 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
950 cmp = frag1->xMax - frag2->xMax;
954 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
957 //------------------------------------------------------------------------
959 //------------------------------------------------------------------------
// Constructs an empty block on page pageA with rotation rotA.
// priMax starts at the page width and the block gets its own TextPool
// (other initialisation is not visible in this excerpt).
961 TextBlock::TextBlock(TextPage *pageA, int rotA) {
967 priMax = page->pageWidth;
968 pool = new TextPool();
975 TextBlock::~TextBlock() {
// Adds `word` to the block.  The word's bounding box is compared with
// the block's on all four sides (presumably extending the block's box;
// the assignments and the step that stores the word are not visible in
// this excerpt).
986 void TextBlock::addWord(TextWord *word) {
994 if (word->xMin < xMin) {
997 if (word->xMax > xMax) {
1000 if (word->yMin < yMin) {
1003 if (word->yMax > yMax) {
// Turns the block's pool of words into lines and assigns text columns.
// Visible phases:
//   1. Discard duplicated text: for each word, scan the buckets whose
//      baseline lies within dupMaxSecDelta * fontSize for another word
//      with identical text whose bounding box matches within
//      priDelta/secDelta, and unlink that duplicate from its bucket.
//   2. Build lines: repeatedly pick a starting word (the left-most word
//      among the first few non-empty buckets), create a TextLine for it,
//      then keep appending the best following word whose baseline lies
//      within maxIntraLineDelta * fontSize and whose gap to the previous
//      word lies between minCharSpacing and maxWordSpacing times the
//      font size.  Each finished line is coalesced and its length added
//      to charCount.
//   3. Sort the lines with TextLine::cmpXY.
//   4. Column assignment: each line's starting column is derived from
//      the lines sorted before it, then added to every entry of its
//      col[] array; nColumns tracks the largest end column.
// Many statements (loop bodies, breaks, list insertion) are not visible
// in this excerpt, so the summary above covers only what is shown.
1009 void TextBlock::coalesce(UnicodeMap *uMap) {
1010 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1011 TextLine *line, *line0, *line1;
1012 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1013 int baseIdx, bestWordBaseIdx, idx0, idx1;
1014 double minBase, maxBase;
1015 double fontSize, delta, priDelta, secDelta;
1016 TextLine **lineArray;
1021 // discard duplicated text (fake boldface, drop shadows)
1022 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1023 word0 = pool->getPool(idx0);
1025 priDelta = dupMaxPriDelta * word0->fontSize;
1026 secDelta = dupMaxSecDelta * word0->fontSize;
1027 if (rot == 0 || rot == 3) {
1028 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1030 maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1033 word1 = word2 = NULL; // make gcc happy
1034 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1037 word2 = word0->next;
1040 word2 = pool->getPool(idx1);
1042 for (; word2; word1 = word2, word2 = word2->next) {
// same length and identical code points => candidate duplicate
1043 if (word2->len == word0->len &&
1044 !memcmp(word2->text, word0->text,
1045 word0->len * sizeof(Unicode))) {
1049 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1050 fabs(word0->xMax - word2->xMax) < priDelta &&
1051 fabs(word0->yMin - word2->yMin) < secDelta &&
1052 fabs(word0->yMax - word2->yMax) < secDelta;
1056 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1057 fabs(word0->xMax - word2->xMax) < secDelta &&
1058 fabs(word0->yMin - word2->yMin) < priDelta &&
1059 fabs(word0->yMax - word2->yMax) < priDelta;
1073 word1->next = word2->next;
1075 pool->setPool(idx1, word2->next);
1079 word0 = word0->next;
1086 poolMinBaseIdx = pool->minBaseIdx;
1091 // find the first non-empty line in the pool
1093 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1095 if (poolMinBaseIdx > pool->maxBaseIdx) {
1099 // look for the left-most word in the first four lines of the
1100 // pool -- this avoids starting with a superscript word
1101 startBaseIdx = poolMinBaseIdx;
1102 for (baseIdx = poolMinBaseIdx + 1;
1103 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1105 if (!pool->getPool(baseIdx)) {
1108 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1110 startBaseIdx = baseIdx;
1114 // create a new line
1115 word0 = pool->getPool(startBaseIdx);
1116 pool->setPool(startBaseIdx, word0->next);
1118 line = new TextLine(this, word0->rot, word0->base);
1119 line->addWord(word0);
1122 // compute the search range
1123 fontSize = word0->fontSize;
1124 minBase = word0->base - maxIntraLineDelta * fontSize;
1125 maxBase = word0->base + maxIntraLineDelta * fontSize;
1126 minBaseIdx = pool->getBaseIdx(minBase);
1127 maxBaseIdx = pool->getBaseIdx(maxBase);
1129 // find the rest of the words in this line
1132 // find the left-most word whose baseline is in the range for
1134 bestWordBaseIdx = 0;
1135 bestWord0 = bestWord1 = NULL;
1136 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1137 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1139 word0 = word1, word1 = word1->next) {
1140 if (word1->base >= minBase &&
1141 word1->base <= maxBase &&
1142 (delta = lastWord->primaryDelta(word1)) >=
1143 minCharSpacing * fontSize) {
1144 if (delta < maxWordSpacing * fontSize &&
1145 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1146 bestWordBaseIdx = baseIdx;
1158 // remove it from the pool, and add it to the line
1160 bestWord0->next = bestWord1->next;
1162 pool->setPool(bestWordBaseIdx, bestWord1->next);
1164 bestWord1->next = NULL;
1165 line->addWord(bestWord1);
1166 lastWord = bestWord1;
1170 if (curLine && line->cmpYX(curLine) > 0) {
1172 line1 = curLine->next;
1178 line1 && line->cmpYX(line1) > 0;
1179 line0 = line1, line1 = line1->next) ;
1187 line->coalesce(uMap);
1188 charCount += line->len;
1192 // sort lines into xy order for column assignment
1193 lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *));
1194 for (line = lines, i = 0; line; line = line->next, ++i) {
1195 lineArray[i] = line;
1197 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1199 // column assignment
1201 for (i = 0; i < nLines; ++i) {
1202 line0 = lineArray[i];
1204 for (j = 0; j < i; ++j) {
1205 line1 = lineArray[j];
// line1 ends before line0 starts: line0 must begin after line1's last
// column
1206 if (line1->primaryDelta(line0) >= 0) {
1207 col2 = line1->col[line1->len] + 1;
1209 k = 0; // make gcc happy
// otherwise locate the character of line1 whose midpoint line0's start
// has passed, and use that character's column
1214 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1220 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1226 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1232 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1236 col2 = line1->col[k];
1242 for (k = 0; k <= line0->len; ++k) {
1243 line0->col[k] += col1;
1245 if (line0->col[line0->len] > nColumns) {
1246 nColumns = line0->col[line0->len];
// Examines another block `blk` as a possible limit on this block's
// priMin/priMax.
// Depending on the page's primary rotation, blk is considered when its
// extent overlaps this block's extent along one axis; a candidate
// newPriMin is then taken from blk's far edge if blk starts before this
// block, and a candidate newPriMax from blk's near edge if blk ends
// after it, along the other axis.  The candidates are finally compared
// against xMin/priMin and xMax/priMax.
// The statements that set gotPriMin/gotPriMax and that store the final
// values are not visible in this excerpt.
1252 void TextBlock::updatePriMinMax(TextBlock *blk) {
1253 double newPriMin, newPriMax;
1254 GBool gotPriMin, gotPriMax;
1256 gotPriMin = gotPriMax = gFalse;
1257 newPriMin = newPriMax = 0; // make gcc happy
1258 switch (page->primaryRot) {
1261 if (blk->yMin < yMax && blk->yMax > yMin) {
1262 if (blk->xMin < xMin) {
1263 newPriMin = blk->xMax;
1266 if (blk->xMax > xMax) {
1267 newPriMax = blk->xMin;
1274 if (blk->xMin < xMax && blk->xMax > xMin) {
1275 if (blk->yMin < yMin) {
1276 newPriMin = blk->yMax;
1279 if (blk->yMax > yMax) {
1280 newPriMax = blk->yMin;
1287 if (newPriMin > xMin) {
1290 if (newPriMin > priMin) {
1295 if (newPriMax < xMax) {
1298 if (newPriMax < priMax) {
// qsort-style comparator.  p1 and p2 point at array elements of type
// TextBlock*, so each is dereferenced once to reach the block.
// The key pair depends on the primary rotation of blk1's page; in each
// case a first key is compared and a second key breaks ties.
// Returns -1, 0 or 1.
1304 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1305 TextBlock *blk1 = *(TextBlock **)p1;
1306 TextBlock *blk2 = *(TextBlock **)p2;
1309 cmp = 0; // make gcc happy
1310 switch (blk1->page->primaryRot) {
1312 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1313 cmp = blk1->yMin - blk2->yMin;
1317 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1318 cmp = blk2->xMax - blk1->xMax;
1322 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1323 cmp = blk2->yMin - blk1->yMin;
1327 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1328 cmp = blk1->xMax - blk2->xMax;
1332 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// qsort-style comparator.  p1 and p2 point at array elements of type
// TextBlock*, so each is dereferenced once to reach the block.
// Like cmpXYPrimaryRot but with the major and minor keys exchanged (the
// y-direction key is the major key in the first case).  The case is
// chosen by the primary rotation of blk1's page.  Returns -1, 0 or 1.
1335 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1336 TextBlock *blk1 = *(TextBlock **)p1;
1337 TextBlock *blk2 = *(TextBlock **)p2;
1340 cmp = 0; // make gcc happy
1341 switch (blk1->page->primaryRot) {
1343 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1344 cmp = blk1->xMin - blk2->xMin;
1348 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1349 cmp = blk1->yMin - blk2->yMin;
1353 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1354 cmp = blk2->xMax - blk1->xMax;
1358 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1359 cmp = blk2->yMax - blk1->yMax;
1363 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Three-way comparison of this block's starting edge against that of
// `blk`; returns -1, 0 or 1.  One of four differences is used (xMin,
// yMin, reversed xMax, reversed yMax); the switch that selects the case
// is not visible in this excerpt.
1366 int TextBlock::primaryCmp(TextBlock *blk) {
1369 cmp = 0; // make gcc happy
1372 cmp = xMin - blk->xMin;
1375 cmp = yMin - blk->yMin;
1378 cmp = blk->xMax - xMax;
1381 cmp = blk->yMax - yMax;
1384 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
// Computes the gap between this block and `blk` across the axis
// perpendicular to the text direction, in one of four orientations (the
// switch that selects the case and the return statement are not visible
// in this excerpt).  A negative value means the blocks overlap.
1387 double TextBlock::secondaryDelta(TextBlock *blk) {
1390 delta = 0; // make gcc happy
1393 delta = blk->yMin - yMax;
1396 delta = xMin - blk->xMax;
1399 delta = yMin - blk->yMax;
1402 delta = blk->xMin - xMax;
// Tests whether this block lies below `blk`, with the axes chosen by the
// page's primary rotation.  In every case this block's extent must fall
// within blk's [priMin, priMax] range; each condition continues with a
// further term that is not visible in this excerpt.
1408 GBool TextBlock::isBelow(TextBlock *blk) {
1411 below = gFalse; // make gcc happy
1412 switch (page->primaryRot) {
1414 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1418 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1422 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1426 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1434 //------------------------------------------------------------------------
1436 //------------------------------------------------------------------------
// Constructs a flow on page pageA that initially contains the single
// block `blk`.  The flow inherits blk's priMin/priMax, and blk becomes
// both the head and the tail of the block list (other initialisation is
// not visible in this excerpt).
1438 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1444 priMin = blk->priMin;
1445 priMax = blk->priMax;
1446 blocks = lastBlk = blk;
// Destructor.  Advances through the block list one block at a time
// (presumably deleting each block as it goes -- the loop and delete
// statements are not visible in this excerpt).
1450 TextFlow::~TextFlow() {
1455 blocks = blocks->next;
// Appends `blk` to the end of the flow's block list and compares its
// bounding box with the flow's on all four sides (presumably extending
// the flow's box; the assignments, and the update of lastBlk, are not
// visible in this excerpt).
1460 void TextFlow::addBlock(TextBlock *blk) {
1462 lastBlk->next = blk;
1467 if (blk->xMin < xMin) {
1470 if (blk->xMax > xMax) {
1473 if (blk->yMin < yMin) {
1476 if (blk->yMax > yMax) {
// Decides whether `blk` can be appended to this flow.
// A block whose first word uses a strictly larger font than the first
// word of the flow's last block is rejected (equal sizes pass this
// check).  Otherwise blk fits when its extent along the primary axis
// (x for two of the rotation cases, y for the other two) lies within the
// flow's [priMin, priMax].
// prevBlk is not referenced in the lines visible in this excerpt.
1481 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1484 // lower blocks must use smaller fonts
1485 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1489 fits = gFalse; // make gcc happy
1490 switch (page->primaryRot) {
1492 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1495 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1498 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1501 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1507 #if TEXTOUT_WORD_LIST
1509 //------------------------------------------------------------------------
1511 //------------------------------------------------------------------------
// Builds a flat list of the words on a TextPage.  The list stores the
// page's TextWord pointers; no copies are made in the visible code.
// Three orderings are produced:
//   - text->rawOrder: the page's rawWords list, in list order;
//   - physLayout: every word of every flow/block/line is gathered into a
//     temporary array, sorted with TextWord::cmpYX and then appended;
//   - otherwise: words are appended in flow, block, line, word order.
// The word-counting statement and the release of wordArray are not
// visible in this excerpt.
1513 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1518 TextWord **wordArray;
1521 words = new GList();
1523 if (text->rawOrder) {
1524 for (word = text->rawWords; word; word = word->next) {
1525 words->append(word);
1528 } else if (physLayout) {
1529 // this is inefficient, but it's also the least useful of these
// first pass over all words (presumably counting them into nWords)
1532 for (flow = text->flows; flow; flow = flow->next) {
1533 for (blk = flow->blocks; blk; blk = blk->next) {
1534 for (line = blk->lines; line; line = line->next) {
1535 for (word = line->words; word; word = word->next) {
1541 wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *));
// second pass: collect the word pointers
1543 for (flow = text->flows; flow; flow = flow->next) {
1544 for (blk = flow->blocks; blk; blk = blk->next) {
1545 for (line = blk->lines; line; line = line->next) {
1546 for (word = line->words; word; word = word->next) {
1547 wordArray[i++] = word;
1552 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1553 for (i = 0; i < nWords; ++i) {
1554 words->append(wordArray[i]);
1559 for (flow = text->flows; flow; flow = flow->next) {
1560 for (blk = flow->blocks; blk; blk = blk->next) {
1561 for (line = blk->lines; line; line = line->next) {
1562 for (word = line->words; word; word = word->next) {
1563 words->append(word);
1571 TextWordList::~TextWordList() {
// Returns the number of words in the list.
1575 int TextWordList::getLength() {
1576 return words->getLength();
// Returns the word at position idx.  An index outside
// [0, getLength()) takes the guarded branch (presumably returning NULL
// -- that line is not visible in this excerpt).
1579 TextWord *TextWordList::get(int idx) {
1580 if (idx < 0 || idx >= words->getLength()) {
1583 return (TextWord *)words->get(idx);
1586 #endif // TEXTOUT_WORD_LIST
1588 //------------------------------------------------------------------------
1590 //------------------------------------------------------------------------
// Constructs an empty text page.  rawOrderA is stored in rawOrder.
// One TextPool is created for each of the four rotations, the font list
// starts empty and the last-find state is reset (other initialisation is
// not visible in this excerpt).
1592 TextPage::TextPage(GBool rawOrderA) {
1595 rawOrder = rawOrderA;
1602 lastCharOverlap = gFalse;
1604 for (rot = 0; rot < 4; ++rot) {
1605 pools[rot] = new TextPool();
1612 fonts = new GList();
1613 lastFindXMin = lastFindYMin = 0;
1614 haveLastFind = gFalse;
// Destructor.  Loops over the four rotations (presumably deleting the
// per-rotation pools -- the loop body is not visible in this excerpt).
1617 TextPage::~TextPage() {
1622 for (rot = 0; rot < 4; ++rot) {
// Begins a new page.  The page dimensions are taken from `state`, or set
// to zero in the alternative branch (presumably when state is NULL --
// the condition is not visible in this excerpt).
1629 void TextPage::startPage(GfxState *state) {
1632 pageWidth = state->getPageWidth();
1633 pageHeight = state->getPageHeight();
1635 pageWidth = pageHeight = 0;
1639 void TextPage::endPage() {
// Discards the page's accumulated content and returns it to an empty
// state.  The raw word list and the flow list are walked node by node
// (presumably deleting each node), the font list is destroyed with
// deleteGList, and afterwards fresh pools (one per rotation) and a fresh
// font list are created.  The remaining statements are not visible in
// this excerpt.
1645 void TextPage::clear() {
1657 rawWords = rawWords->next;
1661 for (rot = 0; rot < 4; ++rot) {
1666 flows = flows->next;
1671 deleteGList(fonts, TextFontInfo);
1680 for (rot = 0; rot < 4; ++rot) {
1681 pools[rot] = new TextPool();
1688 fonts = new GList();
// Called when the current font changes.
//   1. Looks for an existing TextFontInfo that matches the font in
//      `state`; a new record is created and appended to `fonts`
//      (presumably only when no match was found -- the condition is not
//      visible in this excerpt).
//   2. Sets curFontSize from the transformed font size.  For Type 3
//      fonts the size is rescaled by the width of a reference glyph, in
//      order of preference: the glyph named "m" (divided by 0.6), any
//      single-letter glyph name (divided by 0.5), or any named glyph
//      with positive width (divided by 0.5); it is then scaled by
//      |fm[3] / fm[0]| from the font matrix.
1691 void TextPage::updateFont(GfxState *state) {
1695 int code, mCode, letterCode, anyCode;
1699 // get the font info object
1701 for (i = 0; i < fonts->getLength(); ++i) {
1702 curFont = (TextFontInfo *)fonts->get(i);
1703 if (curFont->matches(state)) {
1709 curFont = new TextFontInfo(state);
1710 fonts->append(curFont);
1713 // adjust the font size
1714 gfxFont = state->getFont();
1715 curFontSize = state->getTransformedFontSize();
1716 if (gfxFont && gfxFont->getType() == fontType3) {
1717 // This is a hack which makes it possible to deal with some Type 3
1718 // fonts. The problem is that it's impossible to know what the
1719 // base coordinate system used in the font is without actually
1720 // rendering the font. This code tries to guess by looking at the
1721 // width of the character 'm' (which breaks if the font is a
1722 // subset that doesn't contain 'm').
1723 mCode = letterCode = anyCode = -1;
1724 for (code = 0; code < 256; ++code) {
1725 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1726 if (name && name[0] == 'm' && name[1] == '\0') {
// NOTE(review): name[1] is read here before name[0] is examined; for an
// empty glyph name ("") that reads one byte past the terminator.
1729 if (letterCode < 0 && name && name[1] == '\0' &&
1730 ((name[0] >= 'A' && name[0] <= 'Z') ||
1731 (name[0] >= 'a' && name[0] <= 'z'))) {
1734 if (anyCode < 0 && name &&
1735 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1740 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1741 // 0.6 is a generic average 'm' width -- yes, this is a hack
1742 curFontSize *= w / 0.6;
1743 } else if (letterCode >= 0 &&
1744 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1745 // even more of a hack: 0.5 is a generic letter width
1746 curFontSize *= w / 0.5;
1747 } else if (anyCode >= 0 &&
1748 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1749 // better than nothing: 0.5 is a generic character width
1750 curFontSize *= w / 0.5;
1752 fm = gfxFont->getFontMatrix();
// NOTE(review): divides by fm[0]; presumably guarded against zero by a
// condition that is not visible in this excerpt -- confirm.
1754 curFontSize *= fabs(fm[3] / fm[0]);
// Starts a new word at (x0, y0).
// The word's rotation (0..3) is derived from the 2x2 linear part of the
// text matrix multiplied by the CTM; for Type 3 fonts the font matrix is
// multiplied in as well.  If the diagonal terms dominate
// (|m0*m3| > |m1*m2|) the rotation is 0 or 2 depending on the sign of
// m[3]; otherwise it is 1 or 3 depending on the sign of m[2].
// A TextWord is then created with the current font, font size and
// character position and stored in curWord.
1759 void TextPage::beginWord(GfxState *state, double x0, double y0) {
1760 double *txtm, *ctm, *fontm;
1764 // This check is needed because Type 3 characters can contain
1765 // text-drawing operations (when TextPage is being used via
1766 // {X,Win}SplashOutputDev rather than TextOutputDev).
1772 // compute the rotation
1773 txtm = state->getTextMat();
1774 ctm = state->getCTM();
1775 m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2];
1776 m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3];
1777 m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2];
1778 m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3];
// NOTE(review): state->getFont() is dereferenced without a NULL check
// here, while TextWord's constructor and updateFont both allow for a
// missing font -- confirm whether a NULL font can reach this point.
1779 if (state->getFont()->getType() == fontType3) {
1780 fontm = state->getFont()->getFontMatrix();
1781 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1782 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1783 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1784 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1790 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1791 rot = (m[3] < 0) ? 0 : 2;
1793 rot = (m[2] > 0) ? 1 : 3;
1796 curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
// Adds one character (character code c, mapped to the uLen Unicode values
// in u) drawn at (x, y) with advance (dx, dy) to the current word.  A new
// word is started when the baseline, spacing or overlap tests below say
// the character does not belong to curWord.
1799 void TextPage::addChar(GfxState *state, double x, double y,
1800 double dx, double dy,
1801 CharCode c, Unicode *u, int uLen) {
1802 double x1, y1, w1, h1, dx2, dy2, base, sp;
1805 // if the previous char was a space, addChar will have called
1806 // endWord, so we need to start a new word
1808 beginWord(state, x, y);
1811 // throw away chars that aren't inside the page bounds
// (x1, y1) is the character origin after state->transform()
1812 state->transform(x, y, &x1, &y1);
1813 if (x1 < 0 || x1 > pageWidth ||
1814 y1 < 0 || y1 > pageHeight) {
1818 // subtract char and word spacing from the dx,dy values
// word spacing is added only for character code 0x20
1819 sp = state->getCharSpace();
1820 if (c == (CharCode)0x20) {
1821 sp += state->getWordSpace();
1823 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
// (w1, h1) is the advance after state->transformDelta()
1826 state->transformDelta(dx, dy, &w1, &h1);
1828 // check the tiny chars limit
// a char counts as tiny when both |w1| and |h1| are below 3; the counter
// is compared against 50000
1829 if (!globalParams->getTextKeepTinyChars() &&
1830 fabs(w1) < 3 && fabs(h1) < 3) {
1831 if (++nTinyChars > 50000) {
1836 // break words at space character
1837 if (uLen == 1 && u[0] == (Unicode)0x20) {
1844 // start a new word if:
1845 // (1) this character's baseline doesn't match the current word's
1847 // (2) there is space between the end of the current word and this
1849 // (3) this character overlaps the previous one (duplicated text), or
1850 // (4) the previous character was an overlap (we want each duplicated
1851 // character to be in a word by itself)
1852 base = sp = 0; // make gcc happy
1853 if (curWord->len > 0) {
// sp is the gap between the word's trailing edge and this character,
// measured along the axis and direction implied by curWord->rot
1854 switch (curWord->rot) {
1857 sp = x1 - curWord->xMax;
1861 sp = y1 - curWord->yMax;
1865 sp = curWord->xMin - x1;
1869 sp = curWord->yMin - y1;
1872 if (fabs(base - curWord->base) > 0.5 ||
1873 sp > minWordBreakSpace * curWord->fontSize ||
1874 sp < -minDupBreakOverlap * curWord->fontSize ||
1876 lastCharOverlap = gTrue;
1878 beginWord(state, x, y);
1880 lastCharOverlap = gFalse;
1883 lastCharOverlap = gFalse;
1886 // page rotation and/or transform matrices can cause text to be
1887 // drawn in reverse order -- in this case, swap the begin/end
1888 // coordinates and break text into individual chars
1889 if ((curWord->rot == 0 && w1 < 0) ||
1890 (curWord->rot == 1 && h1 < 0) ||
1891 (curWord->rot == 2 && w1 > 0) ||
1892 (curWord->rot == 3 && h1 > 0)) {
1894 beginWord(state, x + dx, y + dy);
1901 // add the characters to the current word
// each Unicode value gets its own slot, offset by i advances from (x1, y1)
1906 for (i = 0; i < uLen; ++i) {
1907 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
// Ends the current word.
// NOTE(review): the body is not visible in this listing; only the
// explanatory comment about nested Type 3 text operations is shown.
1913 void TextPage::endWord() {
1914 // This check is needed because Type 3 characters can contain
1915 // text-drawing operations (when TextPage is being used via
1916 // {X,Win}SplashOutputDev rather than TextOutputDev).
// Stores a finished word on the page.  Zero-length words are rejected;
// other words are either linked after rawLastWord or inserted into the
// pool for their rotation (pools[word->rot]).
// NOTE(review): the condition choosing between those two paths is not
// visible here -- presumably the raw-order flag; confirm.
1928 void TextPage::addWord(TextWord *word) {
1929 // throw away zero-length words -- they don't have valid xMin/xMax
1930 // values, and they're useless anyway
1931 if (word->len == 0) {
1938 rawLastWord->next = word;
1944 pools[word->rot]->addWord(word);
// Groups the page's words into blocks and flows.  The phases visible in
// this listing are:
//   1. for each rotation, grow blocks out of the word pool by pulling in
//      words above, below, overlapping, and just left/right of the block;
//   2. accumulate per-rotation character counts (count[rot]) to choose the
//      primary rotation;
//   3. count left-to-right versus right-to-left characters to set
//      primaryLR;
//   4. sort the blocks in xy order and assign columns;
//   5. sort the blocks in yx order and chain them into flows.
// NOTE(review): how physLayout is used is not visible in this listing.
1948 void TextPage::coalesce(GBool physLayout) {
1951 TextWord *word0, *word1, *word2;
1953 TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
1954 TextBlock **blkArray;
1955 TextFlow *flow, *lastFlow;
1956 int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
1957 double minBase, maxBase, newMinBase, newMaxBase;
1958 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
1962 int firstBlkIdx, nBlocksLeft;
1972 uMap = globalParams->getTextEncoding();
1978 #if 0 // for debugging
1979 printf("*** initial words ***\n");
1980 for (rot = 0; rot < 4; ++rot) {
1982 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1983 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
1984 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '",
1985 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1986 word0->base, word0->fontSize);
1987 for (i = 0; i < word0->len; ++i) {
1988 fputc(word0->text[i] & 0xff, stdout);
1997 //----- assemble the blocks
1999 //~ add an outer loop for writing mode (vertical text)
2001 // build blocks for each rotation value
2002 for (rot = 0; rot < 4; ++rot) {
2004 poolMinBaseIdx = pool->minBaseIdx;
2007 // add blocks until no more words are left
2010 // find the first non-empty line in the pool
2012 poolMinBaseIdx <= pool->maxBaseIdx &&
2013 !pool->getPool(poolMinBaseIdx);
2015 if (poolMinBaseIdx > pool->maxBaseIdx) {
2019 // look for the left-most word in the first four lines of the
2020 // pool -- this avoids starting with a superscript word
2021 startBaseIdx = poolMinBaseIdx;
2022 for (baseIdx = poolMinBaseIdx + 1;
2023 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2025 if (!pool->getPool(baseIdx)) {
2028 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2030 startBaseIdx = baseIdx;
2034 // create a new block
// the seed word is unlinked from the head of its pool bucket
2035 word0 = pool->getPool(startBaseIdx);
2036 pool->setPool(startBaseIdx, word0->next);
2038 blk = new TextBlock(this, rot);
2039 blk->addWord(word0);
// all distance thresholds for this block scale with the seed word's
// font size
2041 fontSize = word0->fontSize;
2042 minBase = maxBase = word0->base;
2043 colSpace1 = minColSpacing1 * fontSize;
2044 colSpace2 = minColSpacing2 * fontSize;
2045 lineSpace = maxLineSpacingDelta * fontSize;
2046 intraLineSpace = maxIntraLineDelta * fontSize;
2048 // add words to the block
2052 // look for words on the line above the current top edge of
2054 newMinBase = minBase;
2055 for (baseIdx = pool->getBaseIdx(minBase);
2056 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2059 word1 = pool->getPool(baseIdx);
// candidate must lie within lineSpace above minBase, overlap the block
// along the primary axis, and have a similar font size (delta1)
2061 if (word1->base < minBase &&
2062 word1->base >= minBase - lineSpace &&
2063 ((rot == 0 || rot == 2)
2064 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2065 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2066 fabs(word1->fontSize - fontSize) <
2067 maxBlockFontSizeDelta1 * fontSize) {
2070 word0->next = word1->next;
2072 pool->setPool(baseIdx, word1->next);
2074 word1 = word1->next;
2076 blk->addWord(word2);
2078 newMinBase = word2->base;
2081 word1 = word1->next;
2085 minBase = newMinBase;
2087 // look for words on the line below the current bottom edge of
2089 newMaxBase = maxBase;
2090 for (baseIdx = pool->getBaseIdx(maxBase);
2091 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2094 word1 = pool->getPool(baseIdx);
// mirror image of the scan above, for words within lineSpace below maxBase
2096 if (word1->base > maxBase &&
2097 word1->base <= maxBase + lineSpace &&
2098 ((rot == 0 || rot == 2)
2099 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2100 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2101 fabs(word1->fontSize - fontSize) <
2102 maxBlockFontSizeDelta1 * fontSize) {
2105 word0->next = word1->next;
2107 pool->setPool(baseIdx, word1->next);
2109 word1 = word1->next;
2111 blk->addWord(word2);
2113 newMaxBase = word2->base;
2116 word1 = word1->next;
2120 maxBase = newMaxBase;
2122 // look for words that are on lines already in the block, and
2123 // that overlap the block horizontally
2124 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2125 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2128 word1 = pool->getPool(baseIdx);
// the block extent is widened by colSpace1 on both sides here, and the
// looser font-size tolerance (delta2) applies
2130 if (word1->base >= minBase - intraLineSpace &&
2131 word1->base <= maxBase + intraLineSpace &&
2132 ((rot == 0 || rot == 2)
2133 ? (word1->xMin < blk->xMax + colSpace1 &&
2134 word1->xMax > blk->xMin - colSpace1)
2135 : (word1->yMin < blk->yMax + colSpace1 &&
2136 word1->yMax > blk->yMin - colSpace1)) &&
2137 fabs(word1->fontSize - fontSize) <
2138 maxBlockFontSizeDelta2 * fontSize) {
2141 word0->next = word1->next;
2143 pool->setPool(baseIdx, word1->next);
2145 word1 = word1->next;
2147 blk->addWord(word2);
2151 word1 = word1->next;
2156 // only check for outlying words (the next two chunks of code)
2157 // if we didn't find anything else
2162 // scan down the left side of the block, looking for words
2163 // that are near (but not overlapping) the block; if there are
2164 // three or fewer, add them to the block
// first pass: examine candidates within colSpace2 of the block's leading
// edge (font-size tolerance delta3)
2166 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2167 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2169 word1 = pool->getPool(baseIdx);
2171 if (word1->base >= minBase - intraLineSpace &&
2172 word1->base <= maxBase + intraLineSpace &&
2173 ((rot == 0 || rot == 2)
2174 ? (word1->xMax <= blk->xMin &&
2175 word1->xMax > blk->xMin - colSpace2)
2176 : (word1->yMax <= blk->yMin &&
2177 word1->yMax > blk->yMin - colSpace2)) &&
2178 fabs(word1->fontSize - fontSize) <
2179 maxBlockFontSizeDelta3 * fontSize) {
2183 word1 = word1->next;
// second pass: same test again, this time moving the words into the block
// and extending minBase/maxBase as needed
2186 if (n > 0 && n <= 3) {
2187 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2188 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2191 word1 = pool->getPool(baseIdx);
2193 if (word1->base >= minBase - intraLineSpace &&
2194 word1->base <= maxBase + intraLineSpace &&
2195 ((rot == 0 || rot == 2)
2196 ? (word1->xMax <= blk->xMin &&
2197 word1->xMax > blk->xMin - colSpace2)
2198 : (word1->yMax <= blk->yMin &&
2199 word1->yMax > blk->yMin - colSpace2)) &&
2200 fabs(word1->fontSize - fontSize) <
2201 maxBlockFontSizeDelta3 * fontSize) {
2204 word0->next = word1->next;
2206 pool->setPool(baseIdx, word1->next);
2208 word1 = word1->next;
2210 blk->addWord(word2);
2211 if (word2->base < minBase) {
2212 minBase = word2->base;
2213 } else if (word2->base > maxBase) {
2214 maxBase = word2->base;
2220 word1 = word1->next;
2226 // scan down the right side of the block, looking for words
2227 // that are near (but not overlapping) the block; if there are
2228 // three or fewer, add them to the block
2230 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2231 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2233 word1 = pool->getPool(baseIdx);
2235 if (word1->base >= minBase - intraLineSpace &&
2236 word1->base <= maxBase + intraLineSpace &&
2237 ((rot == 0 || rot == 2)
2238 ? (word1->xMin >= blk->xMax &&
2239 word1->xMin < blk->xMax + colSpace2)
2240 : (word1->yMin >= blk->yMax &&
2241 word1->yMin < blk->yMax + colSpace2)) &&
2242 fabs(word1->fontSize - fontSize) <
2243 maxBlockFontSizeDelta3 * fontSize) {
2247 word1 = word1->next;
2250 if (n > 0 && n <= 3) {
2251 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2252 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2255 word1 = pool->getPool(baseIdx);
2257 if (word1->base >= minBase - intraLineSpace &&
2258 word1->base <= maxBase + intraLineSpace &&
2259 ((rot == 0 || rot == 2)
2260 ? (word1->xMin >= blk->xMax &&
2261 word1->xMin < blk->xMax + colSpace2)
2262 : (word1->yMin >= blk->yMax &&
2263 word1->yMin < blk->yMax + colSpace2)) &&
2264 fabs(word1->fontSize - fontSize) <
2265 maxBlockFontSizeDelta3 * fontSize) {
2268 word0->next = word1->next;
2270 pool->setPool(baseIdx, word1->next);
2272 word1 = word1->next;
2274 blk->addWord(word2);
2275 if (word2->base < minBase) {
2276 minBase = word2->base;
2277 } else if (word2->base > maxBase) {
2278 maxBase = word2->base;
2284 word1 = word1->next;
2292 //~ need to compute the primary writing mode (horiz/vert) in
2293 //~ addition to primary rotation
2295 // coalesce the block, and add it to the list
2296 blk->coalesce(uMap);
2298 lastBlk->next = blk;
// the rotation with the largest running character count is the candidate
// for primaryRot
2303 count[rot] += blk->charCount;
2304 if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2311 #if 0 // for debugging
2312 printf("*** rotation ***\n");
2313 for (rot = 0; rot < 4; ++rot) {
2314 printf(" %d: %6d\n", rot, count[rot]);
2316 printf(" primary rot = %d\n", primaryRot);
2320 #if 0 // for debugging
2321 printf("*** blocks ***\n");
2322 for (blk = blkList; blk; blk = blk->next) {
2323 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2324 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2325 for (line = blk->lines; line; line = line->next) {
2326 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2327 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2328 for (word0 = line->words; word0; word0 = word0->next) {
2329 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2330 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2331 word0->base, word0->fontSize, word0->spaceAfter);
2332 for (i = 0; i < word0->len; ++i) {
2333 fputc(word0->text[i] & 0xff, stdout);
2342 // determine the primary direction
// every character of every word is classified with unicodeTypeL /
// unicodeTypeR; a non-negative lrCount means left-to-right
2344 for (blk = blkList; blk; blk = blk->next) {
2345 for (line = blk->lines; line; line = line->next) {
2346 for (word0 = line->words; word0; word0 = word0->next) {
2347 for (i = 0; i < word0->len; ++i) {
2348 if (unicodeTypeL(word0->text[i])) {
2350 } else if (unicodeTypeR(word0->text[i])) {
2357 primaryLR = lrCount >= 0;
2359 #if 0 // for debugging
2360 printf("*** direction ***\n");
2361 printf("lrCount = %d\n", lrCount);
2362 printf("primaryLR = %d\n", primaryLR);
2365 //----- column assignment
2367 // sort blocks into xy order for column assignment
2368 blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
2369 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2372 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2374 // column assignment
// each block is compared against every earlier block in the sorted array;
// which edges are compared depends on primaryRot
2375 for (i = 0; i < nBlocks; ++i) {
2378 for (j = 0; j < i; ++j) {
2380 col2 = 0; // make gcc happy
2381 switch (primaryRot) {
2383 if (blk0->xMin > blk1->xMax) {
2384 col2 = blk1->col + blk1->nColumns + 3;
2386 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2387 (blk1->xMax - blk1->xMin)) *
2392 if (blk0->yMin > blk1->yMax) {
2393 col2 = blk1->col + blk1->nColumns + 3;
2395 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2396 (blk1->yMax - blk1->yMin)) *
2401 if (blk0->xMax < blk1->xMin) {
2402 col2 = blk1->col + blk1->nColumns + 3;
2404 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2405 (blk1->xMin - blk1->xMax)) *
2410 if (blk0->yMax < blk1->yMin) {
2411 col2 = blk1->col + blk1->nColumns + 3;
2413 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2414 (blk1->yMin - blk1->yMax)) *
// shift every per-character column of the block's lines by col1
// (note the inclusive bound: col[] has len + 1 entries in use)
2424 for (line = blk0->lines; line; line = line->next) {
2425 for (j = 0; j <= line->len; ++j) {
2426 line->col[j] += col1;
2431 #if 0 // for debugging
2432 printf("*** blocks, after column assignment ***\n");
2433 for (blk = blkList; blk; blk = blk->next) {
2434 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2435 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2437 for (line = blk->lines; line; line = line->next) {
2439 for (word0 = line->words; word0; word0 = word0->next) {
2440 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2441 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2442 word0->base, word0->fontSize, word0->spaceAfter);
2443 for (i = 0; i < word0->len; ++i) {
2444 fputc(word0->text[i] & 0xff, stdout);
2453 //----- reading order sort
2455 // sort blocks into yx order (in preparation for reading order sort)
2456 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2458 // compute space on left and right sides of each block
2459 for (i = 0; i < nBlocks; ++i) {
2461 for (j = 0; j < nBlocks; ++j) {
2464 blk0->updatePriMinMax(blk1);
2469 #if 0 // for debugging
2470 printf("*** blocks, after yx sort ***\n");
2471 for (i = 0; i < nBlocks; ++i) {
2473 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2474 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2475 blk->priMin, blk->priMax);
2476 for (line = blk->lines; line; line = line->next) {
2478 for (word0 = line->words; word0; word0 = word0->next) {
2479 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2480 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2481 word0->base, word0->fontSize, word0->spaceAfter);
2482 for (j = 0; j < word0->len; ++j) {
2483 fputc(word0->text[j] & 0xff, stdout);
2493 //~ this needs to be adjusted for writing mode (vertical text)
2494 //~ this also needs to account for right-to-left column ordering
// blkArray is a working copy of the yx-sorted block array
2495 blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
2496 memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2497 flows = lastFlow = NULL;
2499 nBlocksLeft = nBlocks;
2500 while (nBlocksLeft > 0) {
2502 // find the upper-left-most block
// skip over NULL entries of blkArray
2503 for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2506 for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2509 if (blk && blk->secondaryDelta(blk1) > 0) {
2512 if (blk1->primaryCmp(blk) < 0) {
2522 // create a new flow, starting with the upper-left-most block
2523 flow = new TextFlow(this, blk);
2525 lastFlow->next = flow;
2530 fontSize = blk->lines->words->fontSize;
2532 // push the upper-left-most block on the stack
2533 blk->stackNext = NULL;
2536 // find the other blocks in this flow
2539 // find the upper-left-most block under (but within
2540 // maxBlockSpacing of) the top block on the stack
2541 blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2544 for (j = firstBlkIdx; j < nBlocks; ++j) {
2547 if (blkStack->secondaryDelta(blk1) > blkSpace) {
2550 if (blk && blk->secondaryDelta(blk1) > 0) {
2553 if (blk1->isBelow(blkStack) &&
2554 (!blk || blk1->primaryCmp(blk) < 0)) {
2561 // if a suitable block was found, add it to the flow and push it
2563 if (blk && flow->blockFits(blk, blkStack)) {
2567 flow->addBlock(blk);
2568 fontSize = blk->lines->words->fontSize;
2569 blk->stackNext = blkStack;
2572 // otherwise (if there is no block under the top block or the
2573 // block is not suitable), pop the stack
2575 blkStack = blkStack->stackNext;
2581 #if 0 // for debugging
2582 printf("*** flows ***\n");
2583 for (flow = flows; flow; flow = flow->next) {
2584 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2585 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2586 flow->priMin, flow->priMax);
2587 for (blk = flow->blocks; blk; blk = blk->next) {
2588 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2589 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2590 blk->priMin, blk->priMax);
2591 for (line = blk->lines; line; line = line->next) {
2593 for (word0 = line->words; word0; word0 = word0->next) {
2594 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2595 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2596 word0->base, word0->fontSize, word0->spaceAfter);
2597 for (i = 0; i < word0->len; ++i) {
2598 fputc(word0->text[i] & 0xff, stdout);
// Searches the coalesced page for the len-character Unicode string s,
// ignoring the case of Latin A-Z.  Every block and line is scanned; among
// the matches that lie strictly after the start position and strictly
// before the stop position (ordered by y, then by x), the one with the
// smallest (yMin, xMin) is kept.
//   startAtTop / stopAtBottom -- when true, no start / stop limit applies
//   startAtLast / stopAtLast  -- when true and a previous find exists,
//                                the limit is the previous match's
//                                (lastFindXMin, lastFindYMin)
//   xMin, yMin, xMax, yMax    -- NOTE(review): presumably supply the
//                                limits otherwise and receive the match
//                                box; those lines are not visible in this
//                                listing -- confirm against the full file
// On a match, the match origin is remembered in lastFindXMin/lastFindYMin
// and haveLastFind is set.
2613 GBool TextPage::findText(Unicode *s, int len,
2614 GBool startAtTop, GBool stopAtBottom,
2615 GBool startAtLast, GBool stopAtLast,
2616 double *xMin, double *yMin,
2617 double *xMax, double *yMax) {
2623 double xStart, yStart, xStop, yStop;
2624 double xMin0, yMin0, xMax0, yMax0;
2625 double xMin1, yMin1, xMax1, yMax1;
2628 //~ needs to handle right-to-left text
2634 xStart = yStart = xStop = yStop = 0;
2635 if (startAtLast && haveLastFind) {
2636 xStart = lastFindXMin;
2637 yStart = lastFindYMin;
2638 } else if (!startAtTop) {
2642 if (stopAtLast && haveLastFind) {
2643 xStop = lastFindXMin;
2644 yStop = lastFindYMin;
2645 } else if (!stopAtBottom) {
// (xMin0..yMax0) holds the best match so far, (xMin1..yMax1) the candidate
2651 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2652 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2654 for (i = 0; i < nBlocks; ++i) {
2657 // check: is the block above the top limit?
2658 if (!startAtTop && blk->yMax < yStart) {
2662 // check: is the block below the bottom limit?
2663 if (!stopAtBottom && blk->yMin > yStop) {
2667 for (line = blk->lines; line; line = line->next) {
2669 // check: is the line above the top limit?
2670 if (!startAtTop && line->yMin < yStart) {
2674 // check: is the line below the bottom limit?
2675 if (!stopAtBottom && line->yMin > yStop) {
2679 // search each position in this line
2681 for (j = 0, p = line->text; j <= m - len; ++j, ++p) {
2683 // compare the strings
2684 for (k = 0; k < len; ++k) {
2685 #if 1 //~ this lowercases Latin A-Z only -- this will eventually be
2686 //~ extended to handle other character sets
2687 if (p[k] >= 0x41 && p[k] <= 0x5a) {
2692 if (s[k] >= 0x41 && s[k] <= 0x5a) {
// the match box runs from edge[j] to edge[j + len] along the line's
// primary axis; the edge order is swapped for two of the rotations
2705 switch (line->rot) {
2707 xMin1 = line->edge[j];
2708 xMax1 = line->edge[j + len];
2715 yMin1 = line->edge[j];
2716 yMax1 = line->edge[j + len];
2719 xMin1 = line->edge[j + len];
2720 xMax1 = line->edge[j];
2727 yMin1 = line->edge[j + len];
2728 yMax1 = line->edge[j];
// accept only matches after (xStart, yStart) and before (xStop, yStop).
// The x tie-break on the stop line must use xStop, mirroring the xStart
// test above; comparing xMin1 against yStop mixed the two axes.
2732 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
2734 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
// keep the candidate if it is the first match or sorts before the best
2735 if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
// remember where this find was so that startAtLast/stopAtLast can
// resume from it
2753 lastFindXMin = xMin0;
2754 lastFindYMin = yMin0;
2755 haveLastFind = gTrue;
// Builds a string from the text inside the rectangle (xMin, yMin) -
// (xMax, yMax).  Lines whose boxes intersect the rectangle are clipped to
// the characters whose mid-points fall inside it, collected as line
// fragments, given columns, sorted, and then appended to the result with
// space padding up to each fragment's column and the configured
// end-of-line sequence between lines.
2762 GString *TextPage::getText(double xMin, double yMin,
2763 double xMax, double yMax) {
2769 TextLineFrag *frags;
2770 int nFrags, fragsSize;
2772 char space[8], eol[16];
2773 int spaceLen, eolLen;
2776 int col, idx0, idx1, i, j;
2777 GBool multiLine, oneRot;
2785 // get the output encoding
2786 if (!(uMap = globalParams->getTextEncoding())) {
2789 isUnicode = uMap->isUnicode();
// pre-encode the space and end-of-line sequences in the output encoding
2790 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
2791 eolLen = 0; // make gcc happy
2792 switch (globalParams->getTextEOL()) {
2794 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
2797 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2798 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
2801 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2805 //~ writing mode (horiz/vert)
2807 // collect the line fragments that are in the rectangle
2809 frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
2813 for (i = 0; i < nBlocks; ++i) {
2815 if (xMin < blk->xMax && blk->xMin < xMax &&
2816 yMin < blk->yMax && blk->yMin < yMax) {
2817 for (line = blk->lines; line; line = line->next) {
2818 if (xMin < line->xMax && line->xMin < xMax &&
2819 yMin < line->yMax && line->yMin < yMax) {
// per rotation: the line's centre on the secondary axis must lie inside
// the rectangle, then character mid-points (the average of two adjacent
// edges) are tested against the rectangle on the primary axis
2821 switch (line->rot) {
2823 y = 0.5 * (line->yMin + line->yMax);
2824 if (yMin < y && y < yMax) {
2826 while (j < line->len) {
2827 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2835 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2844 x = 0.5 * (line->xMin + line->xMax);
2845 if (xMin < x && x < xMax) {
2847 while (j < line->len) {
2848 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2856 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2865 y = 0.5 * (line->yMin + line->yMax);
2866 if (yMin < y && y < yMax) {
2868 while (j < line->len) {
2869 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2877 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2886 x = 0.5 * (line->xMin + line->xMax);
2887 if (xMin < x && x < xMax) {
2889 while (j < line->len) {
2890 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2898 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
// a fragment is recorded only when both end indices were found; the
// fragment array is reallocated when it is full
2907 if (idx0 >= 0 && idx1 >= 0) {
2908 if (nFrags == fragsSize) {
2910 frags = (TextLineFrag *)
2911 grealloc(frags, fragsSize * sizeof(TextLineFrag));
2913 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
2915 if (lastRot >= 0 && line->rot != lastRot) {
2918 lastRot = line->rot;
2925 // sort the fragments and generate the string
2928 for (i = 0; i < nFrags; ++i) {
2929 frags[i].computeCoords(oneRot);
2931 assignColumns(frags, nFrags, oneRot);
2933 // if all lines in the region have the same rotation, use it;
2934 // otherwise, use the page's primary rotation
2936 qsort(frags, nFrags, sizeof(TextLineFrag),
2937 &TextLineFrag::cmpYXLineRot);
2939 qsort(frags, nFrags, sizeof(TextLineFrag),
2940 &TextLineFrag::cmpYXPrimaryRot);
2945 for (i = 0; i < nFrags; ++i) {
// an end-of-line is emitted when the fragment's column is behind the
// current column, or its baseline differs from the previous fragment's by
// more than maxIntraLineDelta font sizes
2949 if (frag->col < col ||
2950 (i > 0 && fabs(frag->base - frags[i-1].base) >
2951 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
2952 s->append(eol, eolLen);
2958 for (; col < frag->col; ++col) {
2959 s->append(space, spaceLen);
2962 // get the fragment text
2963 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
2967 s->append(eol, eolLen);
// Accumulates the bounding box of the characters whose positions fall in
// [pos, pos + length).  Every word whose character range overlaps the
// requested range contributes the span between its edges for the covered
// characters.
// NOTE(review): the lines that store the result into *xMin..*yMax and the
// return statements are not visible in this listing.
2977 GBool TextPage::findCharRange(int pos, int length,
2978 double *xMin, double *yMin,
2979 double *xMax, double *yMax) {
2983 double xMin0, xMax0, yMin0, yMax0;
2984 double xMin1, xMax1, yMin1, yMax1;
2992 //~ this doesn't correctly handle:
2993 //~ - ranges split across multiple lines (the highlighted region
2994 //~ is the bounding box of all the parts of the range)
2995 //~ - cases where characters don't convert one-to-one into Unicode
2997 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2998 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2999 for (i = 0; i < nBlocks; ++i) {
3001 for (line = blk->lines; line; line = line->next) {
3002 for (word = line->words; word; word = word->next) {
// half-open interval overlap test between the word and the request
3003 if (pos < word->charPos + word->charLen &&
3004 word->charPos < pos + length) {
// j0 / j1 are the first and last covered character indices in this word
3005 j0 = pos - word->charPos;
3009 j1 = pos + length - 1 - word->charPos;
3010 if (j1 >= word->len) {
// the covered span runs from edge[j0] to edge[j1 + 1]; which of the two
// is the minimum depends on the line's rotation
3013 switch (line->rot) {
3015 xMin1 = word->edge[j0];
3016 xMax1 = word->edge[j1 + 1];
3023 yMin1 = word->edge[j0];
3024 yMax1 = word->edge[j1 + 1];
3027 xMin1 = word->edge[j1 + 1];
3028 xMax1 = word->edge[j0];
3035 yMin1 = word->edge[j1 + 1];
3036 yMax1 = word->edge[j0];
// grow the accumulated box; "first" forces the initial assignment
3039 if (first || xMin1 < xMin0) {
3042 if (first || xMax1 > xMax0) {
3045 if (first || yMin1 < yMin0) {
3048 if (first || yMax1 > yMax0) {
// Writes the page's text through outputFunc(outputStream, ...) in one of
// three ways visible below: raw content-stream order, physical layout
// (fragments padded to their columns), or reading order following the
// flows.  Space, end-of-line and end-of-page sequences are pre-encoded in
// the configured text encoding.
3066 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3072 TextLineFrag *frags;
3074 int nFrags, fragsSize;
3076 char space[8], eol[16], eop[8];
3077 int spaceLen, eolLen, eopLen;
3082 // get the output encoding
3083 if (!(uMap = globalParams->getTextEncoding())) {
3086 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3087 eolLen = 0; // make gcc happy
3088 switch (globalParams->getTextEOL()) {
3090 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3093 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3094 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3097 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
// 0x0c (form feed) is the end-of-page marker
3100 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3101 pageBreaks = globalParams->getTextPageBreaks();
3103 //~ writing mode (horiz/vert)
3105 // output the page in raw (content stream) order
3108 for (word = rawWords; word; word = word->next) {
3110 dumpFragment(word->text, word->len, uMap, s);
3111 (*outputFunc)(outputStream, s->getCString(), s->getLength());
// consecutive words whose baselines are close stay on one output line; a
// space is written only when the gap exceeds minWordSpacing font sizes
3114 fabs(word->next->base - word->base) <
3115 maxIntraLineDelta * word->fontSize) {
3116 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3117 (*outputFunc)(outputStream, space, spaceLen);
3120 (*outputFunc)(outputStream, eol, eolLen);
3124 // output the page, maintaining the original physical layout
3125 } else if (physLayout) {
3127 // collect the line fragments for the page and sort them
3129 frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
3131 for (i = 0; i < nBlocks; ++i) {
3133 for (line = blk->lines; line; line = line->next) {
3134 if (nFrags == fragsSize) {
3136 frags = (TextLineFrag *)grealloc(frags,
3137 fragsSize * sizeof(TextLineFrag));
3139 frags[nFrags].init(line, 0, line->len);
3140 frags[nFrags].computeCoords(gTrue);
3144 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3148 for (i = 0; i < nFrags; ++i) {
// pad with spaces up to the fragment's column
3152 for (; col < frag->col; ++col) {
3153 (*outputFunc)(outputStream, space, spaceLen);
3158 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3159 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3162 // print one or more returns if necessary
3163 if (i == nFrags - 1 ||
3164 frags[i+1].col < col ||
3165 fabs(frags[i+1].base - frag->base) >
3166 maxIntraLineDelta * frag->line->words->fontSize) {
// d is the baseline distance to the next fragment in units of this
// fragment's font size, truncated to an int
3167 if (i < nFrags - 1) {
3168 d = (int)((frags[i+1].base - frag->base) /
3169 frag->line->words->fontSize);
3178 for (; d > 0; --d) {
3179 (*outputFunc)(outputStream, eol, eolLen);
3187 // output the page, "undoing" the layout
3189 for (flow = flows; flow; flow = flow->next) {
3190 for (blk = flow->blocks; blk; blk = blk->next) {
3191 for (line = blk->lines; line; line = line->next) {
3193 if (line->hyphenated && (line->next || blk->next)) {
3197 dumpFragment(line->text, n, uMap, s);
3198 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3200 if (!line->hyphenated) {
3202 (*outputFunc)(outputStream, space, spaceLen);
3203 } else if (blk->next) {
3204 //~ this is a bit of a kludge - we should really do a more
3205 //~ intelligent determination of paragraphs
3206 if (blk->next->lines->words->fontSize ==
3207 blk->lines->words->fontSize) {
3208 (*outputFunc)(outputStream, space, spaceLen);
3210 (*outputFunc)(outputStream, eol, eolLen);
3216 (*outputFunc)(outputStream, eol, eolLen);
3217 (*outputFunc)(outputStream, eol, eolLen);
// NOTE(review): presumably guarded by pageBreaks -- the condition is not
// visible in this listing
3223 (*outputFunc)(outputStream, eop, eopLen);
3224 (*outputFunc)(outputStream, eol, eolLen);
// Assigns a starting column to each of the nFrags line fragments.  Two
// strategies are visible: recompute columns from the fragments alone
// (after sorting them in xy order), comparing each fragment with every
// earlier one; or reuse the existing columns and subtract the smallest so
// that the left-most fragment lands in column 0.
// NOTE(review): the test that selects the strategy is not visible here;
// the comments indicate it depends on oneRot.
3230 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3231 TextLineFrag *frag0, *frag1;
3232 int rot, col1, col2, i, j, k;
3234 // all text in the region has the same rotation -- recompute the
3235 // column numbers based only on the text in the region
3237 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3238 rot = frags[0].line->rot;
3239 for (i = 0; i < nFrags; ++i) {
3242 for (j = 0; j < i; ++j) {
3244 col2 = 0; // make gcc happy
// frag0 entirely past frag1: place it one column after frag1's width in
// columns.  Otherwise walk frag1's characters while frag0's leading edge
// is at or beyond the character mid-point.
3247 if (frag0->xMin >= frag1->xMax) {
3248 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3249 frag1->line->col[frag1->start]) + 1;
3251 for (k = frag1->start;
3252 k < frag1->start + frag1->len &&
3253 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3254 frag1->line->edge[k+1]);
3257 frag1->line->col[k] - frag1->line->col[frag1->start];
3261 if (frag0->yMin >= frag1->yMax) {
3262 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3263 frag1->line->col[frag1->start]) + 1;
3265 for (k = frag1->start;
3266 k < frag1->start + frag1->len &&
3267 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3268 frag1->line->edge[k+1]);
3271 frag1->line->col[k] - frag1->line->col[frag1->start];
3275 if (frag0->xMax <= frag1->xMin) {
3276 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3277 frag1->line->col[frag1->start]) + 1;
3279 for (k = frag1->start;
3280 k < frag1->start + frag1->len &&
3281 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3282 frag1->line->edge[k+1]);
3285 frag1->line->col[k] - frag1->line->col[frag1->start];
3289 if (frag0->yMax <= frag1->yMin) {
3290 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3291 frag1->line->col[frag1->start]) + 1;
3293 for (k = frag1->start;
3294 k < frag1->start + frag1->len &&
3295 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3296 frag1->line->edge[k+1]);
3299 frag1->line->col[k] - frag1->line->col[frag1->start];
3310 // the region includes text at different rotations -- use the
3311 // globally assigned column numbers, offset by the minimum column
3312 // number (i.e., shift everything over to column 0)
// col1 = minimum column over all fragments
3314 col1 = frags[0].col;
3315 for (i = 1; i < nFrags; ++i) {
3316 if (frags[i].col < col1) {
3317 col1 = frags[i].col;
3320 for (i = 0; i < nFrags; ++i) {
3321 frags[i].col -= col1;
// Converts len Unicode characters from text through uMap and appends the
// result to the output string s.  For Unicode output encodings, runs of
// the opposite direction are emitted in reversed index order and bracketed
// with the bidirectional embedding controls U+202A (LRE), U+202B (RLE) and
// U+202C (PDF); for other encodings the characters are mapped in order.
// NOTE(review): callers add the return value to their column counter, so
// it is presumably the number of columns produced -- the return statement
// is not visible in this listing.
3326 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3328 char lre[8], rle[8], popdf[8], buf[8];
3329 int lreLen, rleLen, popdfLen, n;
3334 if (uMap->isUnicode()) {
// pre-encode the three embedding control characters
3336 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3337 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3338 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3344 // output a left-to-right section
// j stops at the first character of right-to-left type
3345 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3346 for (k = i; k < j; ++k) {
3347 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3352 // output a right-to-left section
// j stops at the first character of left-to-right type; the run is then
// written from its last character back to its first
3353 for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3355 s->append(rle, rleLen);
3356 for (k = j - 1; k >= i; --k) {
3357 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3361 s->append(popdf, popdfLen);
3368 s->append(rle, rleLen);
3371 // output a right-to-left section
// this branch walks the text backwards, from index i down towards 0
3372 for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3373 for (k = i; k > j; --k) {
3374 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3379 // output a left-to-right section
3380 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3382 s->append(lre, lreLen);
3383 for (k = j + 1; k <= i; ++k) {
3384 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3388 s->append(popdf, popdfLen);
3392 s->append(popdf, popdfLen);
// non-Unicode encodings: map every character in order
3397 for (i = 0; i < len; ++i) {
3398 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
#if TEXTOUT_WORD_LIST
// Creates a TextWordList from this page, passing `physLayout` straight
// through to the TextWordList constructor.  The list is allocated with
// `new`; nothing in this function releases it.
TextWordList *TextPage::makeWordList(GBool physLayout) {
  TextWordList *words = new TextWordList(this, physLayout);
  return words;
}
#endif
3413 //------------------------------------------------------------------------
3415 //------------------------------------------------------------------------
// Output callback that writes `len` bytes of `text` to a stdio stream.
//
//   stream -- a FILE* passed as void*; it is cast back before writing
//   text   -- bytes to write (not required to be NUL-terminated)
//   len    -- number of bytes to write
//
// Calls with nothing to write (len <= 0) or with a null stream/buffer are
// ignored: a negative length would otherwise be converted to an enormous
// size_t by fwrite and read far past the buffer, and a null FILE* or
// buffer is undefined behaviour.
static void outputToFile(void *stream, char *text, int len) {
  if (stream == NULL || text == NULL || len <= 0) {
    return;
  }
  fwrite(text, 1, static_cast<size_t>(len), static_cast<FILE *>(stream));
}
// Constructs a text output device that writes to a named file or stdout.
//
//   fileName    -- "-" selects stdout; any other name is opened with fopen()
//   physLayoutA -- stored in physLayout
//   rawOrderA   -- stored in rawOrder and also passed to the TextPage
//                  constructor
//   append      -- chooses the fopen() mode: "ab" when true, "wb" otherwise
//
// NOTE(review): several statements and branch lines of this constructor
// are not visible in this listing (the embedded numbering skips), so the
// exact control flow around the error path and the NULL-stream assignment
// should be confirmed against the complete source.
3421 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3422 GBool rawOrderA, GBool append) {
3424 physLayout = physLayoutA;
3425 rawOrder = rawOrderA;
// The special name "-" means: write to standard output.
3431 if (!strcmp(fileName, "-")) {
3432 outputStream = stdout;
3434 // keep DOS from munging the end-of-line characters
3435 setmode(fileno(stdout), O_BINARY);
// Any other name is opened in binary mode, appending or truncating
// according to `append`; the branch is taken when fopen() succeeds.
3437 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
// Presumably the fopen()-failure path -- the enclosing `else` is not
// visible here.
3440 error(-1, "Couldn't open text file '%s'", fileName);
// Route output through the stdio-based callback outputToFile().
3444 outputFunc = &outputToFile;
// NOTE(review): the condition guarding this assignment is not visible;
// presumably the case where no output file is wanted -- confirm.
3446 outputStream = NULL;
3449 // set up text object
3450 text = new TextPage(rawOrderA);
// Constructs a text output device that delivers its output through a
// caller-supplied callback instead of a file.
//
//   func        -- output callback; NOTE(review): the statement that stores
//                  it is not visible in this listing -- presumably assigned
//                  to outputFunc, confirm
//   stream      -- opaque pointer stored in outputStream
//   physLayoutA -- stored in physLayout
//   rawOrderA   -- stored in rawOrder and passed to the TextPage constructor
3453 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3454 GBool physLayoutA, GBool rawOrderA) {
3456 outputStream = stream;
3458 physLayout = physLayoutA;
3459 rawOrder = rawOrderA;
3460 text = new TextPage(rawOrderA);
// Destructor: releases the output stream.
//
// NOTE(review): the conditions guarding the two calls below (and any
// further cleanup, e.g. of `text`) are not visible in this listing --
// confirm against the complete source.
3464 TextOutputDev::~TextOutputDev() {
// Uses the ICSupport helper (included at the top of the file for setting
// type/creator of MacOS files) on the FILE's handle before closing.
3467 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
// outputStream is treated as a stdio FILE* here.
3469 fclose((FILE *)outputStream);
3476 void TextOutputDev::startPage(int pageNum, GfxState *state) {
3477 text->startPage(state);
// Finishes the current page: coalesces the collected text and dumps it
// through the output callback.
//
// NOTE(review): lines are missing from this listing between the visible
// statements (the embedded numbering skips), so there may be an additional
// call before coalesce() and a guard around dump() -- confirm.
3480 void TextOutputDev::endPage() {
// Coalesce the page's text, honouring the physical-layout setting.
3482 text->coalesce(physLayout);
// Write the page text via outputFunc, with outputStream as its stream
// argument.
3484 text->dump(outputStream, outputFunc, physLayout);
3488 void TextOutputDev::updateFont(GfxState *state) {
3489 text->updateFont(state);
3492 void TextOutputDev::beginString(GfxState *state, GString *s) {
3495 void TextOutputDev::endString(GfxState *state) {
3498 void TextOutputDev::drawChar(GfxState *state, double x, double y,
3499 double dx, double dy,
3500 double originX, double originY,
3501 CharCode c, Unicode *u, int uLen) {
3502 text->addChar(state, x, y, dx, dy, c, u, uLen);
3505 GBool TextOutputDev::findText(Unicode *s, int len,
3506 GBool startAtTop, GBool stopAtBottom,
3507 GBool startAtLast, GBool stopAtLast,
3508 double *xMin, double *yMin,
3509 double *xMax, double *yMax) {
3510 return text->findText(s, len, startAtTop, stopAtBottom,
3511 startAtLast, stopAtLast, xMin, yMin, xMax, yMax);
3514 GString *TextOutputDev::getText(double xMin, double yMin,
3515 double xMax, double yMax) {
3516 return text->getText(xMin, yMin, xMax, yMax);
3519 GBool TextOutputDev::findCharRange(int pos, int length,
3520 double *xMin, double *yMin,
3521 double *xMax, double *yMax) {
3522 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
// Only compiled when TEXTOUT_WORD_LIST is enabled.
3525 #if TEXTOUT_WORD_LIST
// Returns the word list built by the underlying TextPage, using this
// device's physLayout setting.
3526 TextWordList *TextOutputDev::makeWordList() {
3527 return text->makeWordList(physLayout);