X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.h;h=2c622376f3937b31985da9b08badcdd058173ba7;hb=refs%2Ftags%2FGNOME_2_8_ANCHOR;hp=f13a23a0503d106c01d36ffc7086e2e807e59d8c;hpb=50e9d31c05e9ca11ad43cc570556094782c1b956;p=evince.git diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h index f13a23a0..2c622376 100644 --- a/pdf/xpdf/TextOutputDev.h +++ b/pdf/xpdf/TextOutputDev.h @@ -2,14 +2,16 @@ // // TextOutputDev.h // -// Copyright 1997 Derek B. Noonburg +// Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== #ifndef TEXTOUTPUTDEV_H #define TEXTOUTPUTDEV_H -#ifdef __GNUC__ +#include + +#ifdef USE_GCC_PRAGMAS #pragma interface #endif @@ -18,46 +20,324 @@ #include "GfxFont.h" #include "OutputDev.h" -class GfxState; class GString; +class GList; +class GfxFont; +class GfxState; +class UnicodeMap; //------------------------------------------------------------------------ -// TextString + +typedef void (*TextOutputFunc)(void *stream, char *text, int len); + +//------------------------------------------------------------------------ +// TextFontInfo //------------------------------------------------------------------------ -class TextString { +class TextFontInfo { +public: + + TextFontInfo(GfxState *state); + ~TextFontInfo(); + + GBool matches(GfxState *state); + +private: + + GfxFont *gfxFont; +#if TEXTOUT_WORD_LIST + GString *fontName; +#endif + + friend class TextWord; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextWord +//------------------------------------------------------------------------ + +class TextWord { public: // Constructor. - TextString(GfxState *state, GBool hexCodes1); + TextWord(GfxState *state, int rotA, double x0, double y0, + int charPosA, TextFontInfo *fontA, double fontSize); // Destructor. - ~TextString(); + ~TextWord(); - // Add a character to the string. + // Add a character to the word. void addChar(GfxState *state, double x, double y, - double dx, double dy, - Guchar c, GBool useASCII7); + double dx, double dy, Unicode u); + + // Merge onto the end of . + void merge(TextWord *word); + + // Compares to , returning -1 (<), 0 (=), or +1 (>), + // based on a primary-axis comparison, e.g., x ordering if rot=0. + int primaryCmp(TextWord *word); + + // Return the distance along the primary axis between and + // . + double primaryDelta(TextWord *word); + + static int cmpYX(const void *p1, const void *p2); + +#if TEXTOUT_WORD_LIST + int getLength() { return len; } + Unicode getChar(int idx) { return text[idx]; } + GString *getText(); + GString *getFontName() { return font->fontName; } + void getColor(double *r, double *g, double *b) + { *r = colorR; *g = colorG; *b = colorB; } + void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) + { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } + int getCharPos() { return charPos; } + int getCharLen() { return charLen; } +#endif + +private: + + int rot; // rotation, multiple of 90 degrees + // (0, 1, 2, or 3) + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double base; // baseline x or y coordinate + Unicode *text; // the text + double *edge; // "near" edge x or y coord of each char + // (plus one extra entry for the last char) + int len; // length of text and edge arrays + int size; // size of text and edge arrays + int charPos; // character position (within content stream) + int charLen; // number of content stream characters in + // this word + TextFontInfo *font; // font information + double fontSize; // font size + GBool spaceAfter; // set if there is a space between this + // word and the next word on the line + TextWord *next; // next word in line + +#if TEXTOUT_WORD_LIST + double colorR, // word color + colorG, + colorB; +#endif + + friend class TextPool; + friend class TextLine; + friend class TextBlock; + friend class TextFlow; + friend class TextWordList; + friend class TextPage; +}; - // Add a 16-bit character to the string. - void addChar16(GfxState *state, double x, double y, - double dx, double dy, - int c, GfxFontCharSet16 charSet); +//------------------------------------------------------------------------ +// TextPool +//------------------------------------------------------------------------ + +class TextPool { +public: + + TextPool(); + ~TextPool(); + + TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } + void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } + + int getBaseIdx(double base); + + void addWord(TextWord *word); + +private: + + int minBaseIdx; // min baseline bucket index + int maxBaseIdx; // max baseline bucket index + TextWord **pool; // array of linked lists, one for each + // baseline value (multiple of 4 pts) + TextWord *cursor; // pointer to last-accessed word + int cursorBaseIdx; // baseline bucket index of last-accessed word + + friend class TextBlock; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +class TextLine { +public: + + TextLine(TextBlock *blkA, int rotA, double baseA); + ~TextLine(); + + void addWord(TextWord *word); + + // Return the distance along the primary axis between and + // . + double primaryDelta(TextLine *line); + + // Compares to , returning -1 (<), 0 (=), or +1 (>), + // based on a primary-axis comparison, e.g., x ordering if rot=0. + int primaryCmp(TextLine *line); + + // Compares to , returning -1 (<), 0 (=), or +1 (>), + // based on a secondary-axis comparison of the baselines, e.g., y + // ordering if rot=0. + int secondaryCmp(TextLine *line); + + int cmpYX(TextLine *line); + + static int cmpXY(const void *p1, const void *p2); + + void coalesce(UnicodeMap *uMap); private: + TextBlock *blk; // parent block + int rot; // text rotation double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates + double base; // baseline x or y coordinate + TextWord *words; // words in this line + TextWord *lastWord; // last word in this line + Unicode *text; // Unicode text of the line, including + // spaces between words + double *edge; // "near" edge x or y coord of each char + // (plus one extra entry for the last char) + int *col; // starting column number of each Unicode char + int len; // number of Unicode chars + int convertedLen; // total number of converted characters + GBool hyphenated; // set if last char is a hyphen + TextLine *next; // next line in block + + friend class TextLineFrag; + friend class TextBlock; + friend class TextFlow; + friend class TextWordList; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +class TextBlock { +public: + + TextBlock(TextPage *pageA, int rotA); + ~TextBlock(); + + void addWord(TextWord *word); + + void coalesce(UnicodeMap *uMap); + + // Update this block's priMin and priMax values, looking at . + void updatePriMinMax(TextBlock *blk); + + static int cmpXYPrimaryRot(const void *p1, const void *p2); + + static int cmpYXPrimaryRot(const void *p1, const void *p2); + + int primaryCmp(TextBlock *blk); + + double secondaryDelta(TextBlock *blk); + + // Returns true if is below , relative to the page's + // primary rotation. + GBool isBelow(TextBlock *blk); + +private: + + TextPage *page; // the parent page + int rot; // text rotation + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double priMin, priMax; // whitespace bounding box along primary axis + + TextPool *pool; // pool of words (used only until lines + // are built) + TextLine *lines; // linked list of lines + TextLine *curLine; // most recently added line + int nLines; // number of lines + int charCount; // number of characters in the block int col; // starting column - GString *text; // the text - double *xRight; // right-hand x coord of each char - TextString *yxNext; // next string in y-major order - TextString *xyNext; // next string in x-major order - GBool hexCodes; // subsetted font with hex char codes + int nColumns; // number of columns in the block + + TextBlock *next; + TextBlock *stackNext; + + friend class TextLine; + friend class TextLineFrag; + friend class TextFlow; + friend class TextWordList; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +class TextFlow { +public: + + TextFlow(TextPage *pageA, TextBlock *blk); + ~TextFlow(); + // Add a block to the end of this flow. + void addBlock(TextBlock *blk); + + // Returns true if fits below in the flow, i.e., (1) + // it uses a font no larger than the last block added to the flow, + // and (2) it fits within the flow's [priMin, priMax] along the + // primary axis. + GBool blockFits(TextBlock *blk, TextBlock *prevBlk); + +private: + + TextPage *page; // the parent page + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double priMin, priMax; // whitespace bounding box along primary axis + TextBlock *blocks; // blocks in flow + TextBlock *lastBlk; // last block in this flow + TextFlow *next; + + friend class TextWordList; friend class TextPage; }; +#if TEXTOUT_WORD_LIST + +//------------------------------------------------------------------------ +// TextWordList +//------------------------------------------------------------------------ + +class TextWordList { +public: + + // Build a flat word list, in content stream order (if + // text->rawOrder is true), physical layout order (if + // is true and text->rawOrder is false), or reading order (if both + // flags are false). + TextWordList(TextPage *text, GBool physLayout); + + ~TextWordList(); + + // Return the number of words on the list. + int getLength(); + + // Return the th word from the list. + TextWord *get(int idx); + +private: + + GList *words; +}; + +#endif // TEXTOUT_WORD_LIST + //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ @@ -66,35 +346,47 @@ class TextPage { public: // Constructor. - TextPage(GBool useASCII7, GBool rawOrder); + TextPage(GBool rawOrderA); // Destructor. ~TextPage(); - // Begin a new string. - void beginString(GfxState *state, GString *s, GBool hex1); + // Start a new page. + void startPage(GfxState *state); + + // End the current page. + void endPage(); + + // Update the current font. + void updateFont(GfxState *state); - // Add a character to the current string. + // Begin a new word. + void beginWord(GfxState *state, double x0, double y0); + + // Add a character to the current word. void addChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c); + double dx, double dy, + CharCode c, Unicode *u, int uLen); - // Add a 16-bit character to the current string. - void addChar16(GfxState *state, double x, double y, - double dx, double dy, int c, - GfxFontCharSet16 charSet); + // End the current word, sorting it into the list of words. + void endWord(); - // End the current string, sorting it into the list of strings. - void endString(); + // Add a word, sorting it into the list of words. + void addWord(TextWord *word); // Coalesce strings that look like parts of the same line. - void coalesce(); - - // Find a string. If is true, starts looking at top of page; - // otherwise starts looking at ,. If is true, - // stops looking at bottom of page; otherwise stops looking at - // ,. If found, sets the text bounding rectange and - // returns true; otherwise returns false. - GBool findText(char *s, GBool top, GBool bottom, + void coalesce(GBool physLayout); + + // Find a string. If is true, starts looking at the + // top of the page; else if is true, starts looking + // immediately after the last find result; else starts looking at + // ,. If is true, stops looking at the + // bottom of the page; else if is true, stops looking + // just before the last find result; else stops looking at + // ,. + GBool findText(Unicode *s, int len, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); @@ -102,22 +394,67 @@ public: GString *getText(double xMin, double yMin, double xMax, double yMax); - // Dump contents of page to a file. - void dump(FILE *f); + // Find a string by character position and length. If found, sets + // the text bounding rectangle and returns true; otherwise returns + // false. + GBool findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax); - // Clear the page. - void clear(); + // Dump contents of page to a file. + void dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout); + +#if TEXTOUT_WORD_LIST + // Build a flat word list, in content stream order (if + // this->rawOrder is true), physical layout order (if + // is true and this->rawOrder is false), or reading order (if both + // flags are false). + TextWordList *makeWordList(GBool physLayout); +#endif private: - GBool useASCII7; // use 7-bit ASCII? - GBool rawOrder; // keep strings in content stream order + void clear(); + void assignColumns(TextLineFrag *frags, int nFrags, int rot); + int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s); - TextString *curStr; // currently active string + GBool rawOrder; // keep text in content stream order - TextString *yxStrings; // strings in y-major order - TextString *xyStrings; // strings in x-major order - TextString *yxCur1, *yxCur2; // cursors for yxStrings list + double pageWidth, pageHeight; // width and height of current page + TextWord *curWord; // currently active string + int charPos; // next character position (within content + // stream) + TextFontInfo *curFont; // current font + double curFontSize; // current font size + int nest; // current nesting level (for Type 3 fonts) + int nTinyChars; // number of "tiny" chars seen so far + GBool lastCharOverlap; // set if the last added char overlapped the + // previous char + + TextPool *pools[4]; // a "pool" of TextWords for each rotation + TextFlow *flows; // linked list of flows + TextBlock **blocks; // array of blocks, in yx order + int nBlocks; // number of blocks + int primaryRot; // primary rotation + GBool primaryLR; // primary direction (true means L-to-R, + // false means R-to-L) + TextWord *rawWords; // list of words, in raw order (only if + // rawOrder is set) + TextWord *rawLastWord; // last word on rawWords list + + GList *fonts; // all font info objects used on this + // page [TextFontInfo] + + double lastFindXMin, // coordinates of the last "find" result + lastFindYMin; + GBool haveLastFind; + + friend class TextLine; + friend class TextLineFrag; + friend class TextBlock; + friend class TextFlow; + friend class TextWordList; }; //------------------------------------------------------------------------ @@ -127,13 +464,20 @@ private: class TextOutputDev: public OutputDev { public: - // Open a text output file. If is NULL, no file is written - // (this is useful, e.g., for searching text). If is true, - // text is converted to 7-bit ASCII; otherwise, text is converted to - // 8-bit ISO Latin-1. should also be set for Japanese - // (EUC-JP) text. If is true, the text is kept in content - // stream order. - TextOutputDev(char *fileName, GBool useASCII7, GBool rawOrder); + // Open a text output file. If is NULL, no file is + // written (this is useful, e.g., for searching text). If + // is true, the original physical layout of the text + // is maintained. If is true, the text is kept in + // content stream order. + TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append); + + // Create a TextOutputDev which will write to a generic stream. If + // is true, the original physical layout of the text + // is maintained. If is true, the text is kept in + // content stream order. + TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); @@ -150,6 +494,13 @@ public: // Does this device use drawChar() or drawString()? virtual GBool useDrawChar() { return gTrue; } + // Does this device use beginType3Char/endType3Char? Otherwise, + // text in Type 3 fonts will be drawn with drawChar/drawString. + virtual GBool interpretType3Chars() { return gFalse; } + + // Does this device need non-text content? + virtual GBool needNonText() { return gFalse; } + //----- initialization and control // Start a page. @@ -165,28 +516,54 @@ public: virtual void beginString(GfxState *state, GString *s); virtual void endString(GfxState *state); virtual void drawChar(GfxState *state, double x, double y, - double dx, double dy, Guchar c); - virtual void drawChar16(GfxState *state, double x, double y, - double dx, double dy, int c); + double dx, double dy, + double originX, double originY, + CharCode c, Unicode *u, int uLen); //----- special access - // Find a string. If is true, starts looking at top of page; - // otherwise starts looking at ,. If is true, - // stops looking at bottom of page; otherwise stops looking at - // ,. If found, sets the text bounding rectange and - // returns true; otherwise returns false. - GBool findText(char *s, GBool top, GBool bottom, + // Find a string. If is true, starts looking at the + // top of the page; else if is true, starts looking + // immediately after the last find result; else starts looking at + // ,. If is true, stops looking at the + // bottom of the page; else if is true, stops looking + // just before the last find result; else stops looking at + // ,. + GBool findText(Unicode *s, int len, + GBool startAtTop, GBool stopAtBottom, + GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); + // Get the text which is inside the specified rectangle. + GString *getText(double xMin, double yMin, + double xMax, double yMax); + + // Find a string by character position and length. If found, sets + // the text bounding rectangle and returns true; otherwise returns + // false. + GBool findCharRange(int pos, int length, + double *xMin, double *yMin, + double *xMax, double *yMax); + +#if TEXTOUT_WORD_LIST + // Build a flat word list, in content stream order (if + // this->rawOrder is true), physical layout order (if + // this->physLayout is true and this->rawOrder is false), or reading + // order (if both flags are false). + TextWordList *makeWordList(); +#endif + private: - FILE *f; // text file - GBool needClose; // need to close the file? + TextOutputFunc outputFunc; // output function + void *outputStream; // output stream + GBool needClose; // need to close the output file? + // (only if outputStream is a FILE*) TextPage *text; // text for the current page + GBool physLayout; // maintain original physical layout when + // dumping text GBool rawOrder; // keep text in content stream order - GBool hexCodes; // subsetted font with hex char codes GBool ok; // set up ok? };