X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;ds=inline;f=pdf%2Fxpdf%2FTextOutputDev.h;h=2c622376f3937b31985da9b08badcdd058173ba7;hb=refs%2Ftags%2FGNOME_2_8_ANCHOR;hp=f681ecfa8d314135b32c7a27b70e5cc38a9440a0;hpb=64676031423465996e83c4a685290f0c3d97a249;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h index f681ecfa..2c622376 100644 --- a/pdf/xpdf/TextOutputDev.h +++ b/pdf/xpdf/TextOutputDev.h @@ -2,7 +2,7 @@ // // TextOutputDev.h // -// Copyright 1997-2002 Glyph & Cog, LLC +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== @@ -20,55 +20,324 @@ #include "GfxFont.h" #include "OutputDev.h" -class GfxState; class GString; -class TextBlock; -class TextLine; - -#undef TEXTOUT_DO_SYMBOLS +class GList; +class GfxFont; +class GfxState; +class UnicodeMap; //------------------------------------------------------------------------ typedef void (*TextOutputFunc)(void *stream, char *text, int len); +//------------------------------------------------------------------------ +// TextFontInfo +//------------------------------------------------------------------------ + +class TextFontInfo { +public: + + TextFontInfo(GfxState *state); + ~TextFontInfo(); + + GBool matches(GfxState *state); + +private: + + GfxFont *gfxFont; +#if TEXTOUT_WORD_LIST + GString *fontName; +#endif + + friend class TextWord; + friend class TextPage; +}; //------------------------------------------------------------------------ -// TextString +// TextWord //------------------------------------------------------------------------ -class TextString { +class TextWord { public: // Constructor. - TextString(GfxState *state, double x0, double y0, - double fontSize); - + TextWord(GfxState *state, int rotA, double x0, double y0, + int charPosA, TextFontInfo *fontA, double fontSize); // Destructor. - ~TextString(); + ~TextWord(); - // Add a character to the string. 
+ // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u); + // Merge <word> onto the end of <this>. + void merge(TextWord *word); + + // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>), + // based on a primary-axis comparison, e.g., x ordering if rot=0. + int primaryCmp(TextWord *word); + + // Return the distance along the primary axis between <this> and + // <word>. + double primaryDelta(TextWord *word); + + static int cmpYX(const void *p1, const void *p2); + +#if TEXTOUT_WORD_LIST + int getLength() { return len; } + Unicode getChar(int idx) { return text[idx]; } + GString *getText(); + GString *getFontName() { return font->fontName; } + void getColor(double *r, double *g, double *b) + { *r = colorR; *g = colorG; *b = colorB; } + void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) + { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } + int getCharPos() { return charPos; } + int getCharLen() { return charLen; } +#endif + private: + int rot; // rotation, multiple of 90 degrees + // (0, 1, 2, or 3) double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates - union { - GBool marked; // temporary flag used by coalesce() - GBool spaceAfter; // insert a space after this string? 
- }; + double base; // baseline x or y coordinate Unicode *text; // the text - double *xRight; // right-hand x coord of each char - int len; // length of text and xRight - int size; // size of text and xRight arrays - TextString *next; + double *edge; // "near" edge x or y coord of each char + // (plus one extra entry for the last char) + int len; // length of text and edge arrays + int size; // size of text and edge arrays + int charPos; // character position (within content stream) + int charLen; // number of content stream characters in + // this word + TextFontInfo *font; // font information + double fontSize; // font size + GBool spaceAfter; // set if there is a space between this + // word and the next word on the line + TextWord *next; // next word in line + +#if TEXTOUT_WORD_LIST + double colorR, // word color + colorG, + colorB; +#endif + friend class TextPool; + friend class TextLine; + friend class TextBlock; + friend class TextFlow; + friend class TextWordList; friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextPool +//------------------------------------------------------------------------ + +class TextPool { +public: + + TextPool(); + ~TextPool(); + + TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } + void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } + + int getBaseIdx(double base); + + void addWord(TextWord *word); + +private: + + int minBaseIdx; // min baseline bucket index + int maxBaseIdx; // max baseline bucket index + TextWord **pool; // array of linked lists, one for each + // baseline value (multiple of 4 pts) + TextWord *cursor; // pointer to last-accessed word + int cursorBaseIdx; // baseline bucket index of last-accessed word + + friend class TextBlock; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextLine 
+//------------------------------------------------------------------------ + +class TextLine { +public: + + TextLine(TextBlock *blkA, int rotA, double baseA); + ~TextLine(); + + void addWord(TextWord *word); + + // Return the distance along the primary axis between <this> and + // <line>. + double primaryDelta(TextLine *line); + + // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>), + // based on a primary-axis comparison, e.g., x ordering if rot=0. + int primaryCmp(TextLine *line); + + // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>), + // based on a secondary-axis comparison of the baselines, e.g., y + // ordering if rot=0. + int secondaryCmp(TextLine *line); + + int cmpYX(TextLine *line); + + static int cmpXY(const void *p1, const void *p2); + + void coalesce(UnicodeMap *uMap); + +private: + + TextBlock *blk; // parent block + int rot; // text rotation + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double base; // baseline x or y coordinate + TextWord *words; // words in this line + TextWord *lastWord; // last word in this line + Unicode *text; // Unicode text of the line, including + // spaces between words + double *edge; // "near" edge x or y coord of each char + // (plus one extra entry for the last char) + int *col; // starting column number of each Unicode char + int len; // number of Unicode chars + int convertedLen; // total number of converted characters + GBool hyphenated; // set if last char is a hyphen + TextLine *next; // next line in block + + friend class TextLineFrag; friend class TextBlock; + friend class TextFlow; + friend class TextWordList; + friend class TextPage; }; +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +class TextBlock { +public: + + TextBlock(TextPage *pageA, int rotA); + ~TextBlock(); + + void addWord(TextWord *word); + + void coalesce(UnicodeMap *uMap); + + // 
Update this block's priMin and priMax values, looking at <blk>. + void updatePriMinMax(TextBlock *blk); + + static int cmpXYPrimaryRot(const void *p1, const void *p2); + + static int cmpYXPrimaryRot(const void *p1, const void *p2); + + int primaryCmp(TextBlock *blk); + + double secondaryDelta(TextBlock *blk); + + // Returns true if <this> is below <blk>, relative to the page's + // primary rotation. + GBool isBelow(TextBlock *blk); + +private: + + TextPage *page; // the parent page + int rot; // text rotation + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double priMin, priMax; // whitespace bounding box along primary axis + + TextPool *pool; // pool of words (used only until lines + // are built) + TextLine *lines; // linked list of lines + TextLine *curLine; // most recently added line + int nLines; // number of lines + int charCount; // number of characters in the block + int col; // starting column + int nColumns; // number of columns in the block + + TextBlock *next; + TextBlock *stackNext; + + friend class TextLine; + friend class TextLineFrag; + friend class TextFlow; + friend class TextWordList; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +class TextFlow { +public: + + TextFlow(TextPage *pageA, TextBlock *blk); + ~TextFlow(); + + // Add a block to the end of this flow. + void addBlock(TextBlock *blk); + + // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1) + // it uses a font no larger than the last block added to the flow, + // and (2) it fits within the flow's [priMin, priMax] along the + // primary axis. 
+ GBool blockFits(TextBlock *blk, TextBlock *prevBlk); + +private: + + TextPage *page; // the parent page + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double priMin, priMax; // whitespace bounding box along primary axis + TextBlock *blocks; // blocks in flow + TextBlock *lastBlk; // last block in this flow + TextFlow *next; + + friend class TextWordList; + friend class TextPage; +}; + +#if TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextWordList +//------------------------------------------------------------------------ + +class TextWordList { +public: + + // Build a flat word list, in content stream order (if + // text->rawOrder is true), physical layout order (if <physLayout> + // is true and text->rawOrder is false), or reading order (if both + // flags are false). + TextWordList(TextPage *text, GBool physLayout); + + ~TextWordList(); + + // Return the number of words on the list. + int getLength(); + + // Return the <idx>th word from the list. + TextWord *get(int idx); + +private: + + GList *words; +}; + +#endif // TEXTOUT_WORD_LIST + //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ @@ -82,34 +351,42 @@ public: // Destructor. ~TextPage(); + // Start a new page. + void startPage(GfxState *state); + + // End the current page. + void endPage(); + // Update the current font. void updateFont(GfxState *state); + // Begin a new word. + void beginWord(GfxState *state, double x0, double y0); - // Begin a new string. - void beginString(GfxState *state, double x0, double y0); - - // Add a character to the current string. + // Add a character to the current word. void addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode *u, int uLen); - - // End the current string, sorting it into the list of strings. 
- void endString(); + double dx, double dy, + CharCode c, Unicode *u, int uLen); - // Add a string, sorting it into the list of strings. - void addString(TextString *str); + // End the current word, sorting it into the list of words. + void endWord(); + // Add a word, sorting it into the list of words. + void addWord(TextWord *word); // Coalesce strings that look like parts of the same line. - void coalesce(); - - // Find a string. If <top> is true, starts looking at top of page; - // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true, - // stops looking at bottom of page; otherwise stops looking at - // <xMax>,<yMax>. If found, sets the text bounding rectange and - // returns true; otherwise returns false. + void coalesce(GBool physLayout); + + // Find a string. If <startAtTop> is true, starts looking at the + // top of the page; else if <startAtLast> is true, starts looking + // immediately after the last find result; else starts looking at + // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the + // bottom of the page; else if <stopAtLast> is true, stops looking + // just before the last find result; else stops looking at + // <xMax>,<yMax>. GBool findText(Unicode *s, int len, - GBool top, GBool bottom, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); @@ -117,33 +394,67 @@ public: GString *getText(double xMin, double yMin, double xMax, double yMax); - // Dump contents of page to a file. - void dump(void *outputStream, TextOutputFunc outputFunc); + // Find a string by character position and length. If found, sets + // the text bounding rectangle and returns true; otherwise returns + // false. + GBool findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax); - // Clear the page. - void clear(); + // Dump contents of page to a file. 
+ void dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout); + +#if TEXTOUT_WORD_LIST + // Build a flat word list, in content stream order (if + // this->rawOrder is true), physical layout order (if <physLayout> + // is true and this->rawOrder is false), or reading order (if both + // flags are false). + TextWordList *makeWordList(GBool physLayout); +#endif private: - GBool xyBefore(TextString *str1, TextString *str2); - GBool xyBefore(TextBlock *blk1, TextBlock *blk2); - GBool yxBefore(TextBlock *blk1, TextBlock *blk2); - double coalesceFit(TextString *str1, TextString *str2); - - GBool rawOrder; // keep strings in content stream order - - TextString *curStr; // currently active string - double fontSize; // current font size + void clear(); + void assignColumns(TextLineFrag *frags, int nFrags, int rot); + int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s); - TextString *xyStrings; // strings in x-major order (before - // they're sorted into lines) - TextString *xyCur1, *xyCur2; // cursors for xyStrings list - TextLine *lines; // list of lines + GBool rawOrder; // keep text in content stream order + double pageWidth, pageHeight; // width and height of current page + TextWord *curWord; // currently active string + int charPos; // next character position (within content + // stream) + TextFontInfo *curFont; // current font + double curFontSize; // current font size int nest; // current nesting level (for Type 3 fonts) - int nTinyChars; // number of "tiny" chars seen so far - + GBool lastCharOverlap; // set if the last added char overlapped the + // previous char + + TextPool *pools[4]; // a "pool" of TextWords for each rotation + TextFlow *flows; // linked list of flows + TextBlock **blocks; // array of blocks, in yx order + int nBlocks; // number of blocks + int primaryRot; // primary rotation + GBool primaryLR; // primary direction (true means L-to-R, + // false means R-to-L) + TextWord *rawWords; // list of words, in raw order (only if + 
// rawOrder is set) + TextWord *rawLastWord; // last word on rawWords list + + GList *fonts; // all font info objects used on this + // page [TextFontInfo] + + double lastFindXMin, // coordinates of the last "find" result + lastFindYMin; + GBool haveLastFind; + + friend class TextLine; + friend class TextLineFrag; + friend class TextBlock; + friend class TextFlow; + friend class TextWordList; }; //------------------------------------------------------------------------ @@ -155,12 +466,18 @@ public: // Open a text output file. If <fileName> is NULL, no file is // written (this is useful, e.g., for searching text). If - // <rawOrderA> is true, the text is kept in content stream order. - TextOutputDev(char *fileName, GBool rawOrderA, GBool append); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrderA> is true, the text is kept in + // content stream order. + TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append); // Create a TextOutputDev which will write to a generic stream. If - // <rawOrderA> is true, the text is kept in content stream order. - TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrderA> is true, the text is kept in + // content stream order. + TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); @@ -203,17 +520,18 @@ public: double originX, double originY, CharCode c, Unicode *u, int uLen); - //----- path painting - //----- special access - // Find a string. If <top> is true, starts looking at top of page; - // otherwise starts looking at <xMin>,<yMin>. If <bottom> is true, - // stops looking at bottom of page; otherwise stops looking at - // <xMax>,<yMax>. If found, sets the text bounding rectange and - // returns true; otherwise returns false. + // Find a string. 
If <startAtTop> is true, starts looking at the + // top of the page; else if <startAtLast> is true, starts looking + // immediately after the last find result; else starts looking at + // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the + // bottom of the page; else if <stopAtLast> is true, stops looking + // just before the last find result; else stops looking at + // <xMax>,<yMax>. GBool findText(Unicode *s, int len, - GBool top, GBool bottom, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); @@ -221,6 +539,21 @@ public: GString *getText(double xMin, double yMin, double xMax, double yMax); + // Find a string by character position and length. If found, sets + // the text bounding rectangle and returns true; otherwise returns + // false. + GBool findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax); + +#if TEXTOUT_WORD_LIST + // Build a flat word list, in content stream order (if + // this->rawOrder is true), physical layout order (if + // this->physLayout is true and this->rawOrder is false), or reading + // order (if both flags are false). + TextWordList *makeWordList(); +#endif + private: TextOutputFunc outputFunc; // output function @@ -228,9 +561,10 @@ private: GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) TextPage *text; // text for the current page + GBool physLayout; // maintain original physical layout when + // dumping text GBool rawOrder; // keep text in content stream order GBool ok; // set up ok? - }; #endif