1 //========================================================================
5 // Copyright 1997-2002 Glyph & Cog, LLC
7 //========================================================================
9 #ifndef TEXTOUTPUTDEV_H
10 #define TEXTOUTPUTDEV_H
14 #ifdef USE_GCC_PRAGMAS
21 #include "OutputDev.h"
28 //------------------------------------------------------------------------
// Callback type used to emit text. <stream> is an opaque pointer that
// is handed back to the callback, <text> is the character data and
// <len> its length (presumably a byte count, with <text> not
// necessarily NUL-terminated -- confirm against the implementation).
30 typedef void (*TextOutputFunc)(void *stream, char *text, int len);
33 //------------------------------------------------------------------------
35 //------------------------------------------------------------------------
40 TextFontInfo(GfxState *state);
43 GBool matches(GfxState *state);
50 double minSpaceWidth; // min width for inter-word space, as a
51 // fraction of the font size
52 double maxSpaceWidth; // max width for inter-word space, as a
53 // fraction of the font size
56 friend class TextWord;
57 friend class TextPage;
60 //------------------------------------------------------------------------
62 //------------------------------------------------------------------------
68 TextWord(GfxState *state, double x0, double y0,
69 TextFontInfo *fontA, double fontSize);
75 // Add a character to the word.
76 void addChar(GfxState *state, double x, double y,
77 double dx, double dy, Unicode u);
82 GBool xyBefore(TextWord *word2);
83 void merge(TextWord *word2);
85 double xMin, xMax; // bounding box x coordinates
86 double yMin, yMax; // bounding box y coordinates
87 double yBase; // baseline y coordinate
88 Unicode *text; // the text
89 double *xRight; // right-hand x coord of each char
90 int len; // length of text and xRight
91 int size; // size of text and xRight arrays
92 TextFontInfo *font; // font information
93 double fontSize; // font size
94 GBool spaceAfter; // set if there is a space between this
95 // word and the next word on the line
96 TextWord *next; // next word in line (before lines are
97 // assembled: next word in xy order)
100 friend class TextLine;
101 friend class TextPage;
104 //------------------------------------------------------------------------
106 //------------------------------------------------------------------------
116 GBool yxBefore(TextLine *line2);
117 void merge(TextLine *line2);
119 double xMin, xMax; // bounding box x coordinates
120 double yMin, yMax; // bounding box y coordinates
121 double yBase; // primary baseline y coordinate
122 double xSpaceL, xSpaceR; // whitespace to left and right of this line
123 TextFontInfo *font; // primary font
124 double fontSize; // primary font size
125 TextWord *words; // words in this line
126 Unicode *text; // Unicode text of the line, including
127 // spaces between words
128 double *xRight; // right-hand x coord of each Unicode char
129 int *col; // starting column number of each Unicode char
130 int len; // number of Unicode chars
131 int convertedLen; // total number of converted characters
132 GBool hyphenated; // set if last char is a hyphen
133 TextLine *pageNext; // next line on page
134 TextLine *next; // next line in block
135 TextLine *flowNext; // next line in flow
137 friend class TextBlock;
138 friend class TextPage;
141 //------------------------------------------------------------------------
143 //------------------------------------------------------------------------
153 GBool yxBefore(TextBlock *blk2);
154 void mergeRight(TextBlock *blk2);
155 void mergeBelow(TextBlock *blk2);
157 double xMin, xMax; // bounding box x coordinates
158 double yMin, yMax; // bounding box y coordinates
159 double xSpaceL, xSpaceR; // whitespace to left and right of this block
160 double ySpaceT, ySpaceB; // whitespace above and below this block
161 double maxFontSize; // max primary font size
162 TextLine *lines; // lines in block
163 TextBlock *next; // next block in flow
164 TextBlock *stackNext; // next block on traversal stack
166 friend class TextFlow;
167 friend class TextPage;
170 //------------------------------------------------------------------------
172 //------------------------------------------------------------------------
182 double yMin, yMax; // bounding box y coordinates
183 double ySpaceT, ySpaceB; // whitespace above and below this flow
184 TextBlock *blocks; // blocks in flow
185 TextLine *lines; // lines in flow
186 TextFlow *next; // next flow on page
188 friend class TextPage;
192 //------------------------------------------------------------------------
194 //------------------------------------------------------------------------
200 TextPage(GBool rawOrder);
205 // Update the current font.
206 void updateFont(GfxState *state);
210 void beginWord(GfxState *state, double x0, double y0);
212 // Add a character to the current word.
213 void addChar(GfxState *state, double x, double y,
214 double dx, double dy,
215 CharCode c, Unicode *u, int uLen);
217 // End the current word, sorting it into the list of words.
220 // Add a word, sorting it into the list of words.
221 void addWord(TextWord *word);
224 // Coalesce strings that look like parts of the same line.
227 // Find a string. If <top> is true, starts looking at top of page;
228 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
229 // stops looking at bottom of page; otherwise stops looking at
230 // <xMax>,<yMax>. If found, sets the text bounding rectangle and
231 // returns true; otherwise returns false.
232 GBool findText(Unicode *s, int len,
233 GBool top, GBool bottom,
234 double *xMin, double *yMin,
235 double *xMax, double *yMax);
237 // Get the text which is inside the specified rectangle.
238 GString *getText(double xMin, double yMin,
239 double xMax, double yMax);
241 // Dump contents of page to a file.
242 void dump(void *outputStream, TextOutputFunc outputFunc,
246 void startPage(GfxState *state);
252 double lineFit(TextLine *line, TextWord *lastWord, TextWord *word);
253 GBool lineFit2(TextLine *line0, TextLine *line1);
254 GBool blockFit(TextBlock *blk, TextLine *line);
255 GBool blockFit2(TextBlock *blk0, TextBlock *blk1);
256 GBool flowFit(TextFlow *flow, TextBlock *blk);
258 GBool rawOrder; // keep text in content stream order
260 double pageWidth, pageHeight; // width and height of current page
261 TextWord *curWord; // currently active string
262 TextFontInfo *font; // current font
263 double fontSize; // current font size
264 int nest; // current nesting level (for Type 3 fonts)
265 int nTinyChars; // number of "tiny" chars seen so far
267 TextWord *words; // words, in xy order (before they're
268 // sorted into lines)
269 TextWord *wordPtr; // cursor for the word list
271 TextLine *lines; // lines, in xy order
272 TextFlow *flows; // flows, in reading order
274 GList *fonts; // all font info objects used on this
275 // page [TextFontInfo]
280 //------------------------------------------------------------------------
282 //------------------------------------------------------------------------
284 class TextOutputDev: public OutputDev {
287 // Open a text output file. If <fileName> is NULL, no file is
288 // written (this is useful, e.g., for searching text). If
289 // <physLayoutA> is true, the original physical layout of the text
290 // is maintained. If <rawOrderA> is true, the text is kept in
291 // content stream order.
// NOTE(review): <append> is not described above; presumably it selects
// appending to an existing file instead of overwriting it -- confirm
// against the constructor's implementation.
292 TextOutputDev(char *fileName, GBool physLayoutA,
293 GBool rawOrderA, GBool append);
295 // Create a TextOutputDev which will write to a generic stream. If
296 // <physLayoutA> is true, the original physical layout of the text
297 // is maintained. If <rawOrderA> is true, the text is kept in
298 // content stream order.
// <func> is the output callback and <stream> the opaque pointer that
// accompanies it (presumably passed as the callback's first argument
// -- confirm against the implementation).
299 TextOutputDev(TextOutputFunc func, void *stream,
300 GBool physLayoutA, GBool rawOrderA);
303 virtual ~TextOutputDev();
305 // Check if file was successfully created (returns the <ok> flag).
306 virtual GBool isOk() { return ok; }
308 //---- get info about output device
310 // Does this device use upside-down coordinates?
311 // (Upside-down means (0,0) is the top left corner of the page.)
// This device always answers gTrue.
312 virtual GBool upsideDown() { return gTrue; }
314 // Does this device use drawChar() or drawString()?
// Always answers gTrue. This class declares drawChar(), so gTrue
// presumably selects drawChar() -- confirm against OutputDev.
315 virtual GBool useDrawChar() { return gTrue; }
317 // Does this device use beginType3Char/endType3Char? Otherwise,
318 // text in Type 3 fonts will be drawn with drawChar/drawString.
// Always answers gFalse, i.e. the "otherwise" case above applies.
319 virtual GBool interpretType3Chars() { return gFalse; }
321 // Does this device need non-text content? Always answers gFalse.
322 virtual GBool needNonText() { return gFalse; }
324 //----- initialization and control
327 virtual void startPage(int pageNum, GfxState *state);
330 virtual void endPage();
332 //----- update text state
333 virtual void updateFont(GfxState *state);
336 virtual void beginString(GfxState *state, GString *s);
337 virtual void endString(GfxState *state);
338 virtual void drawChar(GfxState *state, double x, double y,
339 double dx, double dy,
340 double originX, double originY,
341 CharCode c, Unicode *u, int uLen);
343 //----- path painting
345 //----- special access
347 // Find a string. If <top> is true, starts looking at top of page;
348 // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
349 // stops looking at bottom of page; otherwise stops looking at
350 // <xMax>,<yMax>. If found, sets the text bounding rectangle and
351 // returns true; otherwise returns false.
352 GBool findText(Unicode *s, int len,
353 GBool top, GBool bottom,
354 double *xMin, double *yMin,
355 double *xMax, double *yMax);
357 // Get the text which is inside the specified rectangle.
358 GString *getText(double xMin, double yMin,
359 double xMax, double yMax);
364 TextOutputFunc outputFunc; // output function
365 void *outputStream; // output stream
366 GBool needClose; // need to close the output file?
367 // (only if outputStream is a FILE*)
368 TextPage *text; // text for the current page
369 GBool physLayout; // maintain original physical layout when
371 GBool rawOrder; // keep text in content stream order
372 GBool ok; // set up ok?