1 //========================================================================
5 // Copyright 1997-2003 Glyph & Cog, LLC
7 //========================================================================
9 #ifndef TEXTOUTPUTDEV_H
10 #define TEXTOUTPUTDEV_H
14 #ifdef USE_GCC_PRAGMAS
21 #include "OutputDev.h"
28 //------------------------------------------------------------------------
// Output callback type.  It is invoked with an opaque <stream>
// pointer, a <text> buffer and a count <len>.
// NOTE(review): <len> is presumably the number of chars in <text>
// (i.e. <text> need not be NUL-terminated) -- confirm at call sites.
30 typedef void (*TextOutputFunc)(void *stream, char *text, int len);
33 //------------------------------------------------------------------------
35 //------------------------------------------------------------------------
// Construct a font info object from the graphics state <state>.
// NOTE(review): presumably records the font currently selected in
// <state> -- confirm against the implementation.
40 TextFontInfo(GfxState *state);
// Compare this font info against the graphics state <state>.
// NOTE(review): presumably returns true when the font selected in
// <state> is the one this object describes -- confirm.
43 GBool matches(GfxState *state);
50 double minSpaceWidth; // min width for inter-word space, as a
51 // fraction of the font size
52 double maxSpaceWidth; // max width for inter-word space, as a
53 // fraction of the font size
56 friend class TextWord;
57 friend class TextPage;
60 //------------------------------------------------------------------------
62 //------------------------------------------------------------------------
// Construct a word.
// NOTE(review): <charPosA>, <fontA> and <fontSize> presumably
// initialize the charPos, font and fontSize members, and (<x0>,<y0>)
// is presumably the position where the word starts -- confirm.
68 TextWord(GfxState *state, double x0, double y0, int charPosA,
69 TextFontInfo *fontA, double fontSize);
75 // Add a character to the word.
// <u> is the Unicode value of the character.
// NOTE(review): (<x>,<y>) and (<dx>,<dy>) are presumably the
// character's position and advance -- confirm.
76 void addChar(GfxState *state, double x, double y,
77 double dx, double dy, Unicode u);
// Ordering test against <word2>.
// NOTE(review): presumably returns true if this word comes before
// <word2> in the "xy order" mentioned on the <next> member -- confirm.
82 GBool xyBefore(TextWord *word2);
// NOTE(review): presumably folds <word2> into this word; whether
// <word2> is freed or stays owned by the caller is not visible here.
83 void merge(TextWord *word2);
85 double xMin, xMax; // bounding box x coordinates
86 double yMin, yMax; // bounding box y coordinates
87 double yBase; // baseline y coordinate
88 Unicode *text; // the text
89 double *xRight; // right-hand x coord of each char
90 int len; // length of text and xRight
91 int size; // size of text and xRight arrays
92 int charPos; // character position (within content stream)
93 int charLen; // number of content stream characters in
// this word
// NOTE(review): the tail of the comment above was missing; "this
// word" is inferred from the enclosing member list -- confirm.
95 TextFontInfo *font; // font information
96 double fontSize; // font size
97 GBool spaceAfter; // set if there is a space between this
98 // word and the next word on the line
99 TextWord *next; // next word in line (before lines are
100 // assembled: next word in xy order)
103 friend class TextLine;
104 friend class TextPage;
107 //------------------------------------------------------------------------
109 //------------------------------------------------------------------------
// Ordering test against <line2>.
// NOTE(review): presumably returns true if this line sorts before
// <line2> (y-major order, judging by the name) -- confirm.
119 GBool yxBefore(TextLine *line2);
// NOTE(review): presumably folds <line2> into this line; what happens
// to <line2> afterwards is not visible here.
120 void merge(TextLine *line2);
122 double xMin, xMax; // bounding box x coordinates
123 double yMin, yMax; // bounding box y coordinates
124 double yBase; // primary baseline y coordinate
125 double xSpaceL, xSpaceR; // whitespace to left and right of this line
126 TextFontInfo *font; // primary font
127 double fontSize; // primary font size
128 TextWord *words; // words in this line
129 TextWord *lastWord; // last word in this line
130 Unicode *text; // Unicode text of the line, including
131 // spaces between words
132 double *xRight; // right-hand x coord of each Unicode char
133 int *col; // starting column number of each Unicode char
134 int len; // number of Unicode chars
135 int convertedLen; // total number of converted characters
136 GBool hyphenated; // set if last char is a hyphen
137 TextLine *pageNext; // next line on page
138 TextLine *next; // next line in block
139 TextLine *flowNext; // next line in flow
141 friend class TextBlock;
142 friend class TextPage;
145 //------------------------------------------------------------------------
147 //------------------------------------------------------------------------
// Ordering test against <blk2>.
// NOTE(review): presumably returns true if this block sorts before
// <blk2> (y-major order, judging by the name) -- confirm.
157 GBool yxBefore(TextBlock *blk2);
// NOTE(review): the two merge functions presumably absorb <blk2>
// into this block when <blk2> lies to the right of / below it; what
// happens to <blk2> afterwards is not visible here -- confirm.
158 void mergeRight(TextBlock *blk2);
159 void mergeBelow(TextBlock *blk2);
161 double xMin, xMax; // bounding box x coordinates
162 double yMin, yMax; // bounding box y coordinates
163 double xSpaceL, xSpaceR; // whitespace to left and right of this block
164 double ySpaceT, ySpaceB; // whitespace above and below this block
165 double maxFontSize; // max primary font size
166 TextLine *lines; // lines in block
167 TextBlock *next; // next block in flow
168 TextBlock *stackNext; // next block on traversal stack
170 friend class TextFlow;
171 friend class TextPage;
174 //------------------------------------------------------------------------
176 //------------------------------------------------------------------------
186 double yMin, yMax; // bounding box y coordinates
187 double ySpaceT, ySpaceB; // whitespace above and below this flow
188 TextBlock *blocks; // blocks in flow
189 TextLine *lines; // lines in flow
190 TextFlow *next; // next flow on page
192 friend class TextPage;
196 //------------------------------------------------------------------------
198 //------------------------------------------------------------------------
// Construct a text page.
// NOTE(review): <rawOrder> presumably initializes the rawOrder member
// ("keep text in content stream order") -- confirm.
204 TextPage(GBool rawOrder);
209 // Update the current font.
// NOTE(review): presumably refreshes the <font> and <fontSize>
// members from <state> -- confirm.
210 void updateFont(GfxState *state);
// NOTE(review): presumably begins a new current word (see the
// <curWord> member) starting at (<x0>,<y0>) -- confirm.
214 void beginWord(GfxState *state, double x0, double y0);
216 // Add a character to the current word.
// <c> is the character code.
// NOTE(review): <u> presumably points to <uLen> Unicode values for
// <c>, and (<x>,<y>) / (<dx>,<dy>) are presumably the character's
// position and advance -- confirm.
217 void addChar(GfxState *state, double x, double y,
218 double dx, double dy,
219 CharCode c, Unicode *u, int uLen);
221 // End the current word, sorting it into the list of words.
224 // Add a word, sorting it into the list of words.
// NOTE(review): the list is presumably the <words> member ("words, in
// xy order"); ownership of <word> after the call is not visible here.
225 void addWord(TextWord *word);
228 // Coalesce strings that look like parts of the same line.
// NOTE(review): <physLayout> presumably has the same meaning as the
// TextOutputDev option (maintain the original physical layout of the
// text) -- confirm.
229 void coalesce(GBool physLayout);
231 // Find a string. If <top> is true, starts looking at top of page;
232 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
233 // stops looking at bottom of page; otherwise stops looking at
234 // <xMax>,<yMax>. If found, sets the text bounding rectangle and
235 // returns true; otherwise returns false.
// <xMin>, <yMin>, <xMax>, <yMax> are therefore in/out parameters: they
// supply the start/stop position and receive the bounding rectangle.
// NOTE(review): <s> is presumably the string to search for and <len>
// its length in Unicode characters -- confirm.
236 GBool findText(Unicode *s, int len,
237 GBool top, GBool bottom,
238 double *xMin, double *yMin,
239 double *xMax, double *yMax);
241 // Get the text which is inside the specified rectangle.
// The rectangle is given by <xMin>, <yMin>, <xMax>, <yMax>.
// NOTE(review): ownership of the returned GString is not visible
// here; presumably the caller must delete it -- confirm.
242 GString *getText(double xMin, double yMin,
243 double xMax, double yMax);
245 // Find a string by character position and length. If found, sets
246 // the text bounding rectangle and returns true; otherwise returns
// false.
// The rectangle is written through <xMin>, <yMin>, <xMax>, <yMax>.
// NOTE(review): <pos> and <length> are presumably counted in content
// stream characters, like the charPos/charLen members -- confirm.
248 GBool findCharRange(int pos, int length,
249 double *xMin, double *yMin,
250 double *xMax, double *yMax);
252 // Dump contents of page to a file.
253 void dump(void *outputStream, TextOutputFunc outputFunc,
// NOTE(review): presumably starts a new page, resetting per-page
// state and taking the page size (<pageWidth>, <pageHeight>) from
// <state> -- confirm.
257 void startPage(GfxState *state);
// NOTE(review): the following helpers are undocumented here.  From the
// signatures: lineFit() returns a double and can write through <space>;
// the others return a GBool.  They presumably decide whether the second
// argument fits with / belongs to the first -- confirm against the
// implementation.
263 double lineFit(TextLine *line, TextWord *word, double *space);
264 GBool lineFit2(TextLine *line0, TextLine *line1);
265 GBool blockFit(TextBlock *blk, TextLine *line);
266 GBool blockFit2(TextBlock *blk0, TextBlock *blk1);
267 GBool flowFit(TextFlow *flow, TextBlock *blk);
269 GBool rawOrder; // keep text in content stream order
271 double pageWidth, pageHeight; // width and height of current page
272 TextWord *curWord; // currently active string
273 int charPos; // next character position (within content
// stream)
275 TextFontInfo *font; // current font
276 double fontSize; // current font size
277 int nest; // current nesting level (for Type 3 fonts)
278 int nTinyChars; // number of "tiny" chars seen so far
280 TextWord *words; // words, in xy order (before they're
281 // sorted into lines)
282 TextWord *wordPtr; // cursor for the word list
284 TextLine *lines; // lines, in xy order
285 TextFlow *flows; // flows, in reading order
287 GList *fonts; // all font info objects used on this
288 // page [TextFontInfo]
293 //------------------------------------------------------------------------
295 //------------------------------------------------------------------------
297 class TextOutputDev: public OutputDev {
300 // Open a text output file. If <fileName> is NULL, no file is
301 // written (this is useful, e.g., for searching text). If
302 // <physLayoutA> is true, the original physical layout of the text
303 // is maintained. If <rawOrderA> is true, the text is kept in
304 // content stream order.
// NOTE(review): <append> is not described; presumably it selects
// appending to an existing file rather than overwriting -- confirm.
305 TextOutputDev(char *fileName, GBool physLayoutA,
306 GBool rawOrderA, GBool append);
308 // Create a TextOutputDev which will write to a generic stream. If
309 // <physLayoutA> is true, the original physical layout of the text
310 // is maintained. If <rawOrderA> is true, the text is kept in
311 // content stream order.
// NOTE(review): <func> is presumably called with <stream> as its
// first argument whenever text is written -- confirm.
312 TextOutputDev(TextOutputFunc func, void *stream,
313 GBool physLayoutA, GBool rawOrderA);
// Destructor.
// NOTE(review): presumably closes the output file when <needClose> is
// set and releases <text> -- confirm against the implementation.
316 virtual ~TextOutputDev();
318 // Check if file was successfully created.
// Returns the value of the <ok> member.
319 virtual GBool isOk() { return ok; }
321 //---- get info about output device
323 // Does this device use upside-down coordinates?
324 // (Upside-down means (0,0) is the top left corner of the page.)
// Always returns gTrue for this device.
325 virtual GBool upsideDown() { return gTrue; }
327 // Does this device use drawChar() or drawString()?
// Always returns gTrue for this device.
// NOTE(review): gTrue presumably selects drawChar() -- confirm against
// the OutputDev base class.
328 virtual GBool useDrawChar() { return gTrue; }
330 // Does this device use beginType3Char/endType3Char? Otherwise,
331 // text in Type 3 fonts will be drawn with drawChar/drawString.
// Always returns gFalse, so this device takes the "otherwise" path.
332 virtual GBool interpretType3Chars() { return gFalse; }
334 // Does this device need non-text content?
// Always returns gFalse for this device.
335 virtual GBool needNonText() { return gFalse; }
337 //----- initialization and control
// NOTE(review): presumably called at the start of page <pageNum> to
// reset the per-page text (<text>) using <state> -- confirm.
340 virtual void startPage(int pageNum, GfxState *state);
// NOTE(review): presumably called when the current page is finished;
// what is done with the collected text at that point is not visible
// here -- confirm against the implementation.
343 virtual void endPage();
345 //----- update text state
// NOTE(review): presumably forwards the font change in <state> to
// the current page (<text>) -- confirm.
346 virtual void updateFont(GfxState *state);
// NOTE(review): presumably bracket the drawing of one string <s>;
// the characters themselves arrive through drawChar() -- confirm
// against the OutputDev base class.
349 virtual void beginString(GfxState *state, GString *s);
350 virtual void endString(GfxState *state);
// <c> is the character code.
// NOTE(review): <u> presumably points to <uLen> Unicode values for
// <c>; (<x>,<y>), (<dx>,<dy>) and (<originX>,<originY>) are presumably
// the character's position, advance and origin offset -- confirm
// against the OutputDev base class.
351 virtual void drawChar(GfxState *state, double x, double y,
352 double dx, double dy,
353 double originX, double originY,
354 CharCode c, Unicode *u, int uLen);
356 //----- path painting
358 //----- special access
360 // Find a string. If <top> is true, starts looking at top of page;
361 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
362 // stops looking at bottom of page; otherwise stops looking at
363 // <xMax>,<yMax>. If found, sets the text bounding rectangle and
364 // returns true; otherwise returns false.
// <xMin>, <yMin>, <xMax>, <yMax> are therefore in/out parameters: they
// supply the start/stop position and receive the bounding rectangle.
// NOTE(review): <s> is presumably the string to search for and <len>
// its length in Unicode characters -- confirm.
365 GBool findText(Unicode *s, int len,
366 GBool top, GBool bottom,
367 double *xMin, double *yMin,
368 double *xMax, double *yMax);
370 // Get the text which is inside the specified rectangle.
// The rectangle is given by <xMin>, <yMin>, <xMax>, <yMax>.
// NOTE(review): ownership of the returned GString is not visible
// here; presumably the caller must delete it -- confirm.
371 GString *getText(double xMin, double yMin,
372 double xMax, double yMax);
374 // Find a string by character position and length. If found, sets
375 // the text bounding rectangle and returns true; otherwise returns
// false.
// The rectangle is written through <xMin>, <yMin>, <xMax>, <yMax>.
// NOTE(review): <pos> and <length> are presumably counted in content
// stream characters -- confirm.
377 GBool findCharRange(int pos, int length,
378 double *xMin, double *yMin,
379 double *xMax, double *yMax);
384 TextOutputFunc outputFunc; // output function
385 void *outputStream; // output stream
386 GBool needClose; // need to close the output file?
387 // (only if outputStream is a FILE*)
388 TextPage *text; // text for the current page
389 GBool physLayout; // maintain original physical layout when
// NOTE(review): the comment above ends mid-sentence (its continuation
// line is missing); restore the missing condition if known.
391 GBool rawOrder; // keep text in content stream order
392 GBool ok; // set up ok?