1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
9 #ifndef TEXTOUTPUTDEV_H
10 #define TEXTOUTPUTDEV_H
14 #ifdef USE_GCC_PRAGMAS
21 #include "OutputDev.h"
29 //------------------------------------------------------------------------
// Output callback type: receives an opaque <stream> pointer, a <text>
// buffer, and <len> (presumably the number of chars in <text>).
31 typedef void (*TextOutputFunc)(void *stream, char *text, int len);
33 //------------------------------------------------------------------------
// TextFontInfo (the class header line is not present in this listing)
35 //------------------------------------------------------------------------
// Constructor; takes the current graphics state.
40 TextFontInfo(GfxState *state);
// NOTE(review): presumably reports whether this object describes the
// font currently selected in <state> -- confirm in the implementation.
43 GBool matches(GfxState *state);
// TextWord and TextPage are granted access to non-public members.
52 friend class TextWord;
53 friend class TextPage;
56 //------------------------------------------------------------------------
// TextWord (the class header line is not present in this listing)
58 //------------------------------------------------------------------------
// Constructor.  <rotA>, <charPosA>, and <fontA> presumably initialize
// the rot, charPos, and font members declared below (TODO confirm).
64 TextWord(GfxState *state, int rotA, double x0, double y0,
65 int charPosA, TextFontInfo *fontA, double fontSize);
70 // Add a character to the word.
71 void addChar(GfxState *state, double x, double y,
72 double dx, double dy, Unicode u);
74 // Merge <word> onto the end of <this>.
75 void merge(TextWord *word);
77 // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
78 // based on a primary-axis comparison, e.g., x ordering if rot=0.
79 int primaryCmp(TextWord *word);
81 // Return the distance along the primary axis between <this> and
// <word>.
83 double primaryDelta(TextWord *word);
// Comparator with the (const void *, const void *) signature taken by
// qsort-style sort routines.  NOTE(review): the name suggests y-then-x
// ordering; confirm in the implementation.
85 static int cmpYX(const void *p1, const void *p2);
// Inline accessors.  None of them validates its arguments: getChar()
// indexes text[] with <idx> directly, and getColor()/getBBox() write
// through their pointer arguments unchecked.
// Number of entries in use in the text array.
88 int getLength() { return len; }
// The <idx>th Unicode value of the word (no bounds check).
89 Unicode getChar(int idx) { return text[idx]; }
// The fontName pointer held by the word's font info (not a copy).
91 GString *getFontName() { return font->fontName; }
// Copy the word color components into <*r>, <*g>, <*b>.
92 void getColor(double *r, double *g, double *b)
93 { *r = colorR; *g = colorG; *b = colorB; }
// Copy the bounding box into the four output arguments.
94 void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
95 { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
// Content-stream position and length of the word.
96 int getCharPos() { return charPos; }
97 int getCharLen() { return charLen; }
102 int rot; // rotation, multiple of 90 degrees
104 double xMin, xMax; // bounding box x coordinates
105 double yMin, yMax; // bounding box y coordinates
106 double base; // baseline x or y coordinate
107 Unicode *text; // the text
108 double *edge; // "near" edge x or y coord of each char
109 // (plus one extra entry for the last char)
110 int len; // length of text and edge arrays
111 int size; // size of text and edge arrays
112 int charPos; // character position (within content stream)
113 int charLen; // number of content stream characters in
// this word (NOTE(review): end of this comment was missing; confirm)
115 TextFontInfo *font; // font information
116 double fontSize; // font size
117 GBool spaceAfter; // set if there is a space between this
118 // word and the next word on the line
119 TextWord *next; // next word in line
121 #if TEXTOUT_WORD_LIST
// NOTE(review): the rest of this declaration (colorG and colorB, which
// getColor() reads) and the closing #endif are not present in this
// listing.
122 double colorR, // word color
127 friend class TextPool;
128 friend class TextLine;
129 friend class TextBlock;
130 friend class TextFlow;
131 friend class TextWordList;
132 friend class TextPage;
135 //------------------------------------------------------------------------
// TextPool (NOTE(review): class header line is not present in this
// listing; name inferred from the TextPool members of TextBlock and
// TextPage)
137 //------------------------------------------------------------------------
// Get / set the head of the word list for baseline bucket <baseIdx>.
// The pool array is indexed relative to minBaseIdx; <baseIdx> is not
// range-checked here.
145 TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
146 void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
// NOTE(review): presumably maps a baseline coordinate to its bucket
// index -- confirm in the implementation.
148 int getBaseIdx(double base);
// Add <word> to the pool.
150 void addWord(TextWord *word);
154 int minBaseIdx; // min baseline bucket index
155 int maxBaseIdx; // max baseline bucket index
156 TextWord **pool; // array of linked lists, one for each
157 // baseline value (multiple of 4 pts)
158 TextWord *cursor; // pointer to last-accessed word
159 int cursorBaseIdx; // baseline bucket index of last-accessed word
161 friend class TextBlock;
162 friend class TextPage;
165 //------------------------------------------------------------------------
// TextLine (the class header line is not present in this listing)
167 //------------------------------------------------------------------------
// Constructor.  <blkA>, <rotA>, and <baseA> presumably initialize the
// blk, rot, and base members declared below (TODO confirm).
172 TextLine(TextBlock *blkA, int rotA, double baseA);
// Add <word> to this line.
175 void addWord(TextWord *word);
177 // Return the distance along the primary axis between <this> and
// <line>.
179 double primaryDelta(TextLine *line);
181 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
182 // based on a primary-axis comparison, e.g., x ordering if rot=0.
183 int primaryCmp(TextLine *line);
185 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
186 // based on a secondary-axis comparison of the baselines, e.g., y
187 // ordering if rot=0.
188 int secondaryCmp(TextLine *line);
// NOTE(review): the names suggest a y-then-x member comparison and an
// x-then-y comparator; confirm in the implementation.  cmpXY has the
// (const void *, const void *) signature taken by qsort-style sorts.
190 int cmpYX(TextLine *line);
192 static int cmpXY(const void *p1, const void *p2);
// NOTE(review): presumably merges the line's words into the text,
// edge, and col arrays below, using <uMap> -- confirm.
194 void coalesce(UnicodeMap *uMap);
198 TextBlock *blk; // parent block
199 int rot; // text rotation
200 double xMin, xMax; // bounding box x coordinates
201 double yMin, yMax; // bounding box y coordinates
202 double base; // baseline x or y coordinate
203 TextWord *words; // words in this line
204 TextWord *lastWord; // last word in this line
205 Unicode *text; // Unicode text of the line, including
206 // spaces between words
207 double *edge; // "near" edge x or y coord of each char
208 // (plus one extra entry for the last char)
209 int *col; // starting column number of each Unicode char
210 int len; // number of Unicode chars
211 int convertedLen; // total number of converted characters
212 GBool hyphenated; // set if last char is a hyphen
213 TextLine *next; // next line in block
215 friend class TextLineFrag;
216 friend class TextBlock;
217 friend class TextFlow;
218 friend class TextWordList;
219 friend class TextPage;
222 //------------------------------------------------------------------------
// TextBlock (the class header line is not present in this listing)
224 //------------------------------------------------------------------------
// Constructor.  <pageA> and <rotA> presumably initialize the page and
// rot members declared below (TODO confirm).
229 TextBlock(TextPage *pageA, int rotA);
// Add <word> to this block.
232 void addWord(TextWord *word);
// NOTE(review): presumably builds the block's lines from its word
// pool, using <uMap> -- confirm in the implementation.
234 void coalesce(UnicodeMap *uMap);
236 // Update this block's priMin and priMax values, looking at <blk>.
237 void updatePriMinMax(TextBlock *blk);
// Comparators with the (const void *, const void *) signature taken by
// qsort-style sorts.  NOTE(review): ordering is suggested by the names
// only; confirm in the implementation.
239 static int cmpXYPrimaryRot(const void *p1, const void *p2);
241 static int cmpYXPrimaryRot(const void *p1, const void *p2);
// NOTE(review): by analogy with the like-named TextWord/TextLine
// methods, presumably a primary-axis comparison against <blk> and a
// secondary-axis distance to <blk>; confirm.
243 int primaryCmp(TextBlock *blk);
245 double secondaryDelta(TextBlock *blk);
247 // Returns true if <this> is below <blk>, relative to the page's
// primary rotation (NOTE(review): end of this comment was missing;
// confirm).
249 GBool isBelow(TextBlock *blk);
253 TextPage *page; // the parent page
254 int rot; // text rotation
255 double xMin, xMax; // bounding box x coordinates
256 double yMin, yMax; // bounding box y coordinates
257 double priMin, priMax; // whitespace bounding box along primary axis
259 TextPool *pool; // pool of words (used only until lines
// are built) (NOTE(review): end of this comment was missing; confirm)
261 TextLine *lines; // linked list of lines
262 TextLine *curLine; // most recently added line
263 int nLines; // number of lines
264 int charCount; // number of characters in the block
265 int col; // starting column
266 int nColumns; // number of columns in the block
// Link to another block; NOTE(review): purpose of this "stack" link is
// not documented in this listing.
269 TextBlock *stackNext;
271 friend class TextLine;
272 friend class TextLineFrag;
273 friend class TextFlow;
274 friend class TextWordList;
275 friend class TextPage;
278 //------------------------------------------------------------------------
// TextFlow (the class header line is not present in this listing)
280 //------------------------------------------------------------------------
// Constructor.  NOTE(review): presumably starts the flow on page
// <pageA> with <blk> as its first block -- confirm.
285 TextFlow(TextPage *pageA, TextBlock *blk);
288 // Add a block to the end of this flow.
289 void addBlock(TextBlock *blk);
291 // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
292 // it uses a font no larger than the last block added to the flow,
293 // and (2) it fits within the flow's [priMin, priMax] along the
// primary axis.
295 GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
299 TextPage *page; // the parent page
300 double xMin, xMax; // bounding box x coordinates
301 double yMin, yMax; // bounding box y coordinates
302 double priMin, priMax; // whitespace bounding box along primary axis
303 TextBlock *blocks; // blocks in flow
304 TextBlock *lastBlk; // last block in this flow
307 friend class TextWordList;
308 friend class TextPage;
311 #if TEXTOUT_WORD_LIST
313 //------------------------------------------------------------------------
// TextWordList (the class header line is not present in this listing)
315 //------------------------------------------------------------------------
320 // Build a flat word list, in content stream order (if
321 // text->rawOrder is true), physical layout order (if <physLayout>
322 // is true and text->rawOrder is false), or reading order (if both
// flags are false).
324 TextWordList(TextPage *text, GBool physLayout);
328 // Return the number of words on the list.
// NOTE(review): the declaration the comment above belongs to is not
// present in this listing.
331 // Return the <idx>th word from the list.
332 TextWord *get(int idx);
339 #endif // TEXTOUT_WORD_LIST
341 //------------------------------------------------------------------------
// TextPage (the class header line is not present in this listing)
343 //------------------------------------------------------------------------
// Constructor.  <rawOrderA> presumably initializes the rawOrder member
// declared below (TODO confirm).
349 TextPage(GBool rawOrderA);
// Start a new page.
355 void startPage(GfxState *state);
357 // End the current page.
// NOTE(review): the declaration the comment above belongs to is not
// present in this listing.
360 // Update the current font.
361 void updateFont(GfxState *state);
// Begin a new word.  NOTE(review): <x0>,<y0> is presumably the word's
// starting position -- confirm.
364 void beginWord(GfxState *state, double x0, double y0);
366 // Add a character to the current word.
367 void addChar(GfxState *state, double x, double y,
368 double dx, double dy,
369 CharCode c, Unicode *u, int uLen);
371 // End the current word, sorting it into the list of words.
// NOTE(review): the declaration the comment above belongs to is not
// present in this listing.
374 // Add a word, sorting it into the list of words.
375 void addWord(TextWord *word);
377 // Coalesce strings that look like parts of the same line.
378 void coalesce(GBool physLayout);
380 // Find a string. If <startAtTop> is true, starts looking at the
381 // top of the page; else if <startAtLast> is true, starts looking
382 // immediately after the last find result; else starts looking at
383 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
384 // bottom of the page; else if <stopAtLast> is true, stops looking
385 // just before the last find result; else stops looking at
// <xMax>,<yMax>.
387 GBool findText(Unicode *s, int len,
388 GBool startAtTop, GBool stopAtBottom,
389 GBool startAtLast, GBool stopAtLast,
390 double *xMin, double *yMin,
391 double *xMax, double *yMax);
393 // Get the text which is inside the specified rectangle.
394 GString *getText(double xMin, double yMin,
395 double xMax, double yMax);
397 // Find a string by character position and length. If found, sets
398 // the text bounding rectangle and returns true; otherwise returns
// false.
400 GBool findCharRange(int pos, int length,
401 double *xMin, double *yMin,
402 double *xMax, double *yMax);
404 // Dump contents of page to a file.
405 void dump(void *outputStream, TextOutputFunc outputFunc,
408 #if TEXTOUT_WORD_LIST
409 // Build a flat word list, in content stream order (if
410 // this->rawOrder is true), physical layout order (if <physLayout>
411 // is true and this->rawOrder is false), or reading order (if both
// flags are false).
413 TextWordList *makeWordList(GBool physLayout);
// Helpers.  NOTE(review): no description is present in this listing;
// the names suggest column assignment for line fragments and writing a
// converted text fragment into <s> -- confirm in the implementation.
419 void assignColumns(TextLineFrag *frags, int nFrags, int rot);
420 int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s);
422 GBool rawOrder; // keep text in content stream order
424 double pageWidth, pageHeight; // width and height of current page
425 TextWord *curWord; // currently active string
426 int charPos; // next character position (within content
// stream)
428 TextFontInfo *curFont; // current font
429 double curFontSize; // current font size
430 int nest; // current nesting level (for Type 3 fonts)
431 int nTinyChars; // number of "tiny" chars seen so far
432 GBool lastCharOverlap; // set if the last added char overlapped the
// previous char (NOTE(review): end of this comment was missing; confirm)
435 TextPool *pools[4]; // a "pool" of TextWords for each rotation
436 TextFlow *flows; // linked list of flows
437 TextBlock **blocks; // array of blocks, in yx order
438 int nBlocks; // number of blocks
439 int primaryRot; // primary rotation
440 GBool primaryLR; // primary direction (true means L-to-R,
441 // false means R-to-L)
442 TextWord *rawWords; // list of words, in raw order (only if
// rawOrder is set) (NOTE(review): end of this comment was missing;
// confirm)
444 TextWord *rawLastWord; // last word on rawWords list
446 GList *fonts; // all font info objects used on this
447 // page [TextFontInfo]
449 double lastFindXMin, // coordinates of the last "find" result
453 friend class TextLine;
454 friend class TextLineFrag;
455 friend class TextBlock;
456 friend class TextFlow;
457 friend class TextWordList;
460 //------------------------------------------------------------------------
// TextOutputDev
462 //------------------------------------------------------------------------
// OutputDev subclass for text: it keeps the text of the current page in
// a TextPage and hands output to a TextOutputFunc.
464 class TextOutputDev: public OutputDev {
467 // Open a text output file. If <fileName> is NULL, no file is
468 // written (this is useful, e.g., for searching text). If
469 // <physLayoutA> is true, the original physical layout of the text
470 // is maintained. If <rawOrder> is true, the text is kept in
471 // content stream order.
472 TextOutputDev(char *fileName, GBool physLayoutA,
473 GBool rawOrderA, GBool append);
475 // Create a TextOutputDev which will write to a generic stream. If
476 // <physLayoutA> is true, the original physical layout of the text
477 // is maintained. If <rawOrder> is true, the text is kept in
478 // content stream order.
479 TextOutputDev(TextOutputFunc func, void *stream,
480 GBool physLayoutA, GBool rawOrderA);
// Destructor.
483 virtual ~TextOutputDev();
485 // Check if file was successfully created.
486 virtual GBool isOk() { return ok; }
488 //---- get info about output device
490 // Does this device use upside-down coordinates?
491 // (Upside-down means (0,0) is the top left corner of the page.)
492 virtual GBool upsideDown() { return gTrue; }
494 // Does this device use drawChar() or drawString()?
495 virtual GBool useDrawChar() { return gTrue; }
497 // Does this device use beginType3Char/endType3Char? Otherwise,
498 // text in Type 3 fonts will be drawn with drawChar/drawString.
499 virtual GBool interpretType3Chars() { return gFalse; }
501 // Does this device need non-text content?
502 virtual GBool needNonText() { return gFalse; }
504 //----- initialization and control
// Start a page.
507 virtual void startPage(int pageNum, GfxState *state);
// End a page.
510 virtual void endPage();
512 //----- update text state
513 virtual void updateFont(GfxState *state);
// String and character drawing callbacks.  NOTE(review): presumably
// these feed the characters into the current page's TextPage (the
// <text> member below) -- confirm in the implementation.
516 virtual void beginString(GfxState *state, GString *s);
517 virtual void endString(GfxState *state);
518 virtual void drawChar(GfxState *state, double x, double y,
519 double dx, double dy,
520 double originX, double originY,
521 CharCode c, Unicode *u, int uLen);
523 //----- special access
525 // Find a string. If <startAtTop> is true, starts looking at the
526 // top of the page; else if <startAtLast> is true, starts looking
527 // immediately after the last find result; else starts looking at
528 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the
529 // bottom of the page; else if <stopAtLast> is true, stops looking
530 // just before the last find result; else stops looking at
// <xMax>,<yMax>.
532 GBool findText(Unicode *s, int len,
533 GBool startAtTop, GBool stopAtBottom,
534 GBool startAtLast, GBool stopAtLast,
535 double *xMin, double *yMin,
536 double *xMax, double *yMax);
538 // Get the text which is inside the specified rectangle.
539 GString *getText(double xMin, double yMin,
540 double xMax, double yMax);
542 // Find a string by character position and length. If found, sets
543 // the text bounding rectangle and returns true; otherwise returns
// false.
545 GBool findCharRange(int pos, int length,
546 double *xMin, double *yMin,
547 double *xMax, double *yMax);
549 #if TEXTOUT_WORD_LIST
550 // Build a flat word list, in content stream order (if
551 // this->rawOrder is true), physical layout order (if
552 // this->physLayout is true and this->rawOrder is false), or reading
553 // order (if both flags are false).
554 TextWordList *makeWordList();
// NOTE(review): the #endif matching the #if above is not present in
// this listing.
559 TextOutputFunc outputFunc; // output function
560 void *outputStream; // output stream
561 GBool needClose; // need to close the output file?
562 // (only if outputStream is a FILE*)
563 TextPage *text; // text for the current page
564 GBool physLayout; // maintain original physical layout when
// writing the text (NOTE(review): end of this comment was missing;
// confirm)
566 GBool rawOrder; // keep text in content stream order
567 GBool ok; // set up ok?