X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=blobdiff_plain;f=pdf%2Fxpdf%2FTextOutputDev.cc;h=3fcc9ec294ecdbd1e7800c98764c0928e04ca6c8;hb=884f739665dc56e66f51e104350f2affd33f2dd8;hp=6d675ef40663e9b9c3eea1da46fae506f10577fc;hpb=50e9d31c05e9ca11ad43cc570556094782c1b956;p=evince.git

diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc
index 6d675ef4..3fcc9ec2 100644
--- a/pdf/xpdf/TextOutputDev.cc
+++ b/pdf/xpdf/TextOutputDev.cc
@@ -2,920 +2,2172 @@
 //
 // TextOutputDev.cc
 //
-// Copyright 1997 Derek B. Noonburg
+// Copyright 1997-2003 Glyph & Cog, LLC
 //
 //========================================================================
 
-#ifdef __GNUC__
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
 #pragma implementation
 #endif
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
+#include <math.h>
 #include <ctype.h>
-#include "GString.h"
+#ifdef WIN32
+#include <fcntl.h> // for O_BINARY
+#include <io.h>    // for setmode
+#endif
 #include "gmem.h"
+#include "GString.h"
+#include "GList.h"
 #include "config.h"
 #include "Error.h"
+#include "GlobalParams.h"
+#include "UnicodeMap.h"
 #include "GfxState.h"
-#include "FontEncoding.h"
 #include "TextOutputDev.h"
 
-#include "TextOutputFontInfo.h"
+#ifdef MACOS
+// needed for setting type/creator of MacOS files
+#include "ICSupport.h"
+#endif
+
+//------------------------------------------------------------------------
+// parameters
+//------------------------------------------------------------------------
+
+// Minium and maximum inter-word spacing (as a fraction of the average
+// character width).
+#define wordMinSpaceWidth 0.3
+#define wordMaxSpaceWidth 2.0
+
+// Default min and max inter-word spacing (when the average character
+// width is unknown).
+#define wordDefMinSpaceWidth 0.2
+#define wordDefMaxSpaceWidth 1.5
+
+// Max difference in x,y coordinates (as a fraction of the font size)
+// allowed for duplicated text (fake boldface, drop shadows) which is
+// to be discarded.
+#define dupMaxDeltaX 0.1
+#define dupMaxDeltaY 0.2
+
+// Min overlap (as a fraction of the font size) required for two
+// lines to be considered vertically overlapping.
+#define lineOverlapSlack 0.5
+
+// Max difference in baseline y coordinates (as a fraction of the font
+// size) allowed for words which are to be grouped into a line, not
+// including sub/superscripts.
+#define lineMaxBaselineDelta 0.1
+
+// Max ratio of font sizes allowed for words which are to be grouped
+// into a line, not including sub/superscripts.
+#define lineMaxFontSizeRatio 1.4
+
+// Min spacing (as a fraction of the font size) allowed between words
+// which are to be grouped into a line.
+#define lineMinDeltaX -0.5
+
+// Minimum vertical overlap (as a fraction of the font size) required
+// for superscript and subscript words.
+#define lineMinSuperscriptOverlap 0.3
+#define lineMinSubscriptOverlap   0.3
+
+// Min/max ratio of font sizes allowed for sub/superscripts compared to
+// the base text.
+#define lineMinSubscriptFontSizeRatio   0.4
+#define lineMaxSubscriptFontSizeRatio   1.01
+#define lineMinSuperscriptFontSizeRatio 0.4
+#define lineMaxSuperscriptFontSizeRatio 1.01
+
+// Max horizontal spacing (as a fraction of the font size) allowed
+// before sub/superscripts.
+#define lineMaxSubscriptDeltaX   0.2
+#define lineMaxSuperscriptDeltaX 0.2
+
+// Maximum vertical spacing (as a fraction of the font size) allowed
+// for lines which are to be grouped into a block.
+#define blkMaxSpacing 2.0
+
+// Max ratio of primary font sizes allowed for lines which are to be
+// grouped into a block.
+#define blkMaxFontSizeRatio 1.3
+
+// Min overlap (as a fraction of the font size) required for two
+// blocks to be considered vertically overlapping.
+#define blkOverlapSlack 0.5
+
+// Max vertical spacing (as a fraction of the font size) allowed
+// between blocks which are 'adjacent' when sorted by reading order.
+#define blkMaxSortSpacing 2.0
+
+// Max vertical offset (as a fraction of the font size) of the top and
+// bottom edges allowed for blocks which are to be grouped into a
+// flow.
+#define flowMaxDeltaY 1.0
+
+//------------------------------------------------------------------------
+// TextFontInfo
+//------------------------------------------------------------------------
+
+TextFontInfo::TextFontInfo(GfxState *state) {
+  double *textMat;
+  double t1, t2, avgWidth, w;
+  int n, i;
+
+  gfxFont = state->getFont();
+  textMat = state->getTextMat();
+  horizScaling = state->getHorizScaling();
+  if ((t1 = fabs(textMat[0])) > 0.01 &&
+      (t2 = fabs(textMat[3])) > 0.01) {
+    horizScaling *= t1 / t2;
+  }
+
+  minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+  maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+  if (gfxFont && gfxFont->isCIDFont()) {
+    //~ handle 16-bit fonts
+  } else if (gfxFont && gfxFont->getType() != fontType3) {
+    avgWidth = 0;
+    n = 0;
+    for (i = 0; i < 256; ++i) {
+      w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
+      if (w > 0) {
+	avgWidth += w;
+	++n;
+      }
+    }
+    if (n > 0) {
+      avgWidth /= n;
+      minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
+      maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
+    }
+  }
+
+}
+
+TextFontInfo::~TextFontInfo() {
+}
+
+GBool TextFontInfo::matches(GfxState *state) {
+  double *textMat;
+  double t1, t2, h;
+
+  textMat = state->getTextMat();
+  h = state->getHorizScaling();
+  if ((t1 = fabs(textMat[0])) > 0.01 &&
+      (t2 = fabs(textMat[3])) > 0.01) {
+    h *= t1 / t2;
+  }
+  return state->getFont() == gfxFont &&
+         fabs(h - horizScaling) < 0.01;
+}
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
+		   TextFontInfo *fontA, double fontSizeA) {
+  GfxFont *gfxFont;
+  double x, y;
+
+  charPos = charPosA;
+  charLen = 0;
+  font = fontA;
+  fontSize = fontSizeA;
+  state->transform(x0, y0, &x, &y);
+  if ((gfxFont = font->gfxFont)) {
+    yMin = y - gfxFont->getAscent() * fontSize;
+    yMax = y - gfxFont->getDescent() * fontSize;
+  } else {
+    // this means that the PDF file draws text without a current font,
+    // which should never happen
+    yMin = y - 0.95 * fontSize;
+    yMax = y + 0.35 * fontSize;
+  }
+  if (yMin == yMax) {
+    // this is a sanity check for a case that shouldn't happen -- but
+    // if it does happen, we want to avoid dividing by zero later
+    yMin = y;
+    yMax = y + 1;
+  }
+  yBase = y;
+  text = NULL;
+  xRight = NULL;
+  len = size = 0;
+  spaceAfter = gFalse;
+  next = NULL;
+
+}
+
+
+TextWord::~TextWord() {
+  gfree(text);
+  gfree(xRight);
+}
+
+void TextWord::addChar(GfxState *state, double x, double y,
+		       double dx, double dy, Unicode u) {
+  if (len == size) {
+    size += 16;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  text[len] = u;
+  if (len == 0) {
+    xMin = x;
+  }
+  xMax = xRight[len] = x + dx;
+  ++len;
+}
+
+// Returns true if <this> comes before <word2> in xy order.
+GBool TextWord::xyBefore(TextWord *word2) {
+  return xMin < word2->xMin ||
+	 (xMin == word2->xMin && yMin < word2->yMin);
+}
+
+// Merge another word onto the end of this one.
+void TextWord::merge(TextWord *word2) {
+  int i;
+
+  xMax = word2->xMax;
+  if (word2->yMin < yMin) {
+    yMin = word2->yMin;
+  }
+  if (word2->yMax > yMax) {
+    yMax = word2->yMax;
+  }
+  if (len + word2->len > size) {
+    size = len + word2->len;
+    text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+    xRight = (double *)grealloc(xRight, size * sizeof(double));
+  }
+  for (i = 0; i < word2->len; ++i) {
+    text[len + i] = word2->text[i];
+    xRight[len + i] = word2->xRight[i];
+  }
+  len += word2->len;
+  charLen += word2->charLen;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+TextLine::TextLine() {
+  words = NULL;
+  text = NULL;
+  xRight = NULL;
+  col = NULL;
+  len = 0;
+  hyphenated = gFalse;
+  pageNext = NULL;
+  next = NULL;
+  flowNext = NULL;
+}
+
+TextLine::~TextLine() {
+  TextWord *w1, *w2;
+
+  for (w1 = words; w1; w1 = w2) {
+    w2 = w1->next;
+    delete w1;
+  }
+  gfree(text);
+  gfree(xRight);
+  gfree(col);
+}
+
+// Returns true if <this> comes before <line2> in yx order, allowing
+// slack for vertically overlapping lines.
+GBool TextLine::yxBefore(TextLine *line2) {
+  double dy;
+
+  dy = lineOverlapSlack * fontSize;
+
+  // non-overlapping case
+  if (line2->yMin > yMax - dy ||
+      line2->yMax < yMin + dy) {
+    return yMin < line2->yMin ||
+           (yMin == line2->yMin && xMin < line2->xMin);
+  }
+
+  // overlapping case
+  return xMin < line2->xMin;
+}
+
+// Merge another line's words onto the end of this line.
+void TextLine::merge(TextLine *line2) {
+  int newLen, i;
+
+  xMax = line2->xMax;
+  if (line2->yMin < yMin) {
+    yMin = line2->yMin;
+  }
+  if (line2->yMax > yMax) {
+    yMax = line2->yMax;
+  }
+  xSpaceR = line2->xSpaceR;
+  lastWord->spaceAfter = gTrue;
+  lastWord->next = line2->words;
+  lastWord = line2->lastWord;
+  line2->words = NULL;
+  newLen = len + 1 + line2->len;
+  text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
+  xRight = (double *)grealloc(xRight, newLen * sizeof(double));
+  text[len] = (Unicode)0x0020;
+  xRight[len] = line2->xMin;
+  for (i = 0; i < line2->len; ++i) {
+    text[len + 1 + i] = line2->text[i];
+    xRight[len + 1 + i] = line2->xRight[i];
+  }
+  len = newLen;
+  convertedLen += line2->convertedLen;
+  hyphenated = line2->hyphenated;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+TextBlock::TextBlock() {
+  lines = NULL;
+  next = NULL;
+}
+
+TextBlock::~TextBlock() {
+  TextLine *l1, *l2;
+
+  for (l1 = lines; l1; l1 = l2) {
+    l2 = l1->next;
+    delete l1;
+  }
+}
+
+// Returns true if <this> comes before <blk2> in xy order, allowing
+// slack for vertically overlapping blocks.
+GBool TextBlock::yxBefore(TextBlock *blk2) {
+  double dy;
+
+  dy = blkOverlapSlack * lines->fontSize;
+
+  // non-overlapping case
+  if (blk2->yMin > yMax - dy ||
+      blk2->yMax < yMin + dy) {
+    return yMin < blk2->yMin ||
+           (yMin == blk2->yMin && xMin < blk2->xMin);
+  }
+
+  // overlapping case
+  return xMin < blk2->xMin;
+}
+
+// Merge another block's line onto the right of this one.
+void TextBlock::mergeRight(TextBlock *blk2) {
+  lines->merge(blk2->lines);
+  xMax = lines->xMax;
+  yMin = lines->yMin;
+  yMax = lines->yMax;
+  xSpaceR = lines->xSpaceR;
+}
+
+// Merge another block's lines onto the bottom of this block.
+void TextBlock::mergeBelow(TextBlock *blk2) {
+  TextLine *line;
+
+  if (blk2->xMin < xMin) {
+    xMin = blk2->xMin;
+  }
+  if (blk2->xMax > xMax) {
+    xMax = blk2->xMax;
+  }
+  yMax = blk2->yMax;
+  if (blk2->xSpaceL > xSpaceL) {
+    xSpaceL = blk2->xSpaceL;
+  }
+  if (blk2->xSpaceR < xSpaceR) {
+    xSpaceR = blk2->xSpaceR;
+  }
+  if (blk2->maxFontSize > maxFontSize) {
+    maxFontSize = blk2->maxFontSize;
+  }
+  for (line = lines; line->next; line = line->next) ;
+  line->next = line->flowNext = blk2->lines;
+  blk2->lines = NULL;
+}
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+TextFlow::TextFlow() {
+  blocks = NULL;
+  next = NULL;
+}
+
+TextFlow::~TextFlow() {
+  TextBlock *b1, *b2;
+
+  for (b1 = blocks; b1; b1 = b2) {
+    b2 = b1->next;
+    delete b1;
+  }
+}
+
+
+//------------------------------------------------------------------------
+// TextPage
+//------------------------------------------------------------------------
+
+TextPage::TextPage(GBool rawOrderA) {
+  rawOrder = rawOrderA;
+  curWord = NULL;
+  charPos = 0;
+  font = NULL;
+  fontSize = 0;
+  nest = 0;
+  nTinyChars = 0;
+  words = wordPtr = NULL;
+  lines = NULL;
+  flows = NULL;
+  fonts = new GList();
+}
+
+TextPage::~TextPage() {
+  clear();
+  delete fonts;
+}
+
+void TextPage::updateFont(GfxState *state) {
+  GfxFont *gfxFont;
+  double *fm;
+  char *name;
+  int code, mCode, letterCode, anyCode;
+  double w;
+  int i;
+
+  // get the font info object
+  font = NULL;
+  for (i = 0; i < fonts->getLength(); ++i) {
+    font = (TextFontInfo *)fonts->get(i);
+    if (font->matches(state)) {
+      break;
+    }
+    font = NULL;
+  }
+  if (!font) {
+    font = new TextFontInfo(state);
+    fonts->append(font);
+  }
+
+  // adjust the font size
+  gfxFont = state->getFont();
+  fontSize = state->getTransformedFontSize();
+  if (gfxFont && gfxFont->getType() == fontType3) {
+    // This is a hack which makes it possible to deal with some Type 3
+    // fonts.  The problem is that it's impossible to know what the
+    // base coordinate system used in the font is without actually
+    // rendering the font.  This code tries to guess by looking at the
+    // width of the character 'm' (which breaks if the font is a
+    // subset that doesn't contain 'm').
+    mCode = letterCode = anyCode = -1;
+    for (code = 0; code < 256; ++code) {
+      name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
+      if (name && name[0] == 'm' && name[1] == '\0') {
+	mCode = code;
+      }
+      if (letterCode < 0 && name && name[1] == '\0' &&
+	  ((name[0] >= 'A' && name[0] <= 'Z') ||
+	   (name[0] >= 'a' && name[0] <= 'z'))) {
+	letterCode = code;
+      }
+      if (anyCode < 0 && name &&
+	  ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
+	anyCode = code;
+      }
+    }
+    if (mCode >= 0 &&
+	(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
+      // 0.6 is a generic average 'm' width -- yes, this is a hack
+      fontSize *= w / 0.6;
+    } else if (letterCode >= 0 &&
+	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
+      // even more of a hack: 0.5 is a generic letter width
+      fontSize *= w / 0.5;
+    } else if (anyCode >= 0 &&
+	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
+      // better than nothing: 0.5 is a generic character width
+      fontSize *= w / 0.5;
+    }
+    fm = gfxFont->getFontMatrix();
+    if (fm[0] != 0) {
+      fontSize *= fabs(fm[3] / fm[0]);
+    }
+  }
+}
+
+void TextPage::beginWord(GfxState *state, double x0, double y0) {
+  // This check is needed because Type 3 characters can contain
+  // text-drawing operations (when TextPage is being used via
+  // XOutputDev rather than TextOutputDev).
+  if (curWord) {
+    ++nest;
+    return;
+  }
+
+  curWord = new TextWord(state, x0, y0, charPos, font, fontSize);
+}
+
+void TextPage::addChar(GfxState *state, double x, double y,
+		       double dx, double dy,
+		       CharCode c, Unicode *u, int uLen) {
+  double x1, y1, w1, h1, dx2, dy2, sp;
+  int n, i;
+
+  // if the previous char was a space, addChar will have called
+  // endWord, so we need to start a new word
+  if (!curWord) {
+    beginWord(state, x, y);
+  }
+
+  // throw away chars that aren't inside the page bounds
+  state->transform(x, y, &x1, &y1);
+  if (x1 < 0 || x1 > pageWidth ||
+      y1 < 0 || y1 > pageHeight) {
+    return;
+  }
+
+  // subtract char and word spacing from the dx,dy values
+  sp = state->getCharSpace();
+  if (c == (CharCode)0x20) {
+    sp += state->getWordSpace();
+  }
+  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
+  dx -= dx2;
+  dy -= dy2;
+  state->transformDelta(dx, dy, &w1, &h1);
+
+  // check the tiny chars limit
+  if (!globalParams->getTextKeepTinyChars() &&
+      fabs(w1) < 3 && fabs(h1) < 3) {
+    if (++nTinyChars > 20000) {
+      return;
+    }
+  }
+
+  // break words at space character
+  if (uLen == 1 && u[0] == (Unicode)0x20) {
+    ++curWord->charLen;
+    ++charPos;
+    endWord();
+    return;
+  }
+
+  // large char spacing is sometimes used to move text around -- in
+  // this case, break text into individual chars and let the coalesce
+  // function deal with it later
+  n = curWord->len;
+  if (n > 0 && x1 - curWord->xRight[n-1] >
+                    curWord->font->minSpaceWidth * curWord->fontSize) {
+    endWord();
+    beginWord(state, x, y);
+  }
+
+  // page rotation and/or transform matrices can cause text to be
+  // drawn in reverse order -- in this case, swap the begin/end
+  // coordinates and break text into individual chars
+  if (w1 < 0) {
+    endWord();
+    beginWord(state, x + dx, y + dy);
+    x1 += w1;
+    y1 += h1;
+    w1 = -w1;
+    h1 = -h1;
+  }
+
+  // add the characters to the current word
+  if (uLen != 0) {
+    w1 /= uLen;
+    h1 /= uLen;
+  }
+  for (i = 0; i < uLen; ++i) {
+    curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
+  }
+  ++curWord->charLen;
+  ++charPos;
+}
+
+void TextPage::endWord() {
+  // This check is needed because Type 3 characters can contain
+  // text-drawing operations (when TextPage is being used via
+  // XOutputDev rather than TextOutputDev).
+  if (nest > 0) {
+    --nest;
+    return;
+  }
+
+  if (curWord) {
+    addWord(curWord);
+    curWord = NULL;
+  }
+}
+
+void TextPage::addWord(TextWord *word) {
+  TextWord *p1, *p2;
+
+  // throw away zero-length words -- they don't have valid xMin/xMax
+  // values, and they're useless anyway
+  if (word->len == 0) {
+    delete word;
+    return;
+  }
+
+  // insert word in xy list
+  if (rawOrder) {
+    p1 = wordPtr;
+    p2 = NULL;
+  } else {
+    if (wordPtr && wordPtr->xyBefore(word)) {
+      p1 = wordPtr;
+      p2 = wordPtr->next;
+    } else {
+      p1 = NULL;
+      p2 = words;
+    }
+    for (; p2; p1 = p2, p2 = p2->next) {
+      if (word->xyBefore(p2)) {
+	break;
+      }
+    }
+  }
+  if (p1) {
+    p1->next = word;
+  } else {
+    words = word;
+  }
+  word->next = p2;
+  wordPtr = word;
+}
+
+void TextPage::coalesce(GBool physLayout) {
+  TextWord *word0, *word1, *word2;
+  TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
+  TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
+  TextBlock *yxBlocks, *blocks, *blkStack;
+  TextFlow *flow0, *flow1;
+  double sz, xLimit, yLimit;
+  double fit1, fit2, sp1, sp2;
+  GBool found;
+  UnicodeMap *uMap;
+  GBool isUnicode;
+  char buf[8];
+  int col1, col2, d, i, j;
+
+#if 0 // for debugging
+  printf("*** initial word list ***\n");
+  for (word0 = words; word0; word0 = word0->next) {
+    printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+	   word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+    for (i = 0; i < word0->len; ++i) {
+      fputc(word0->text[i] & 0xff, stdout);
+    }
+    printf("'\n");
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- discard duplicated text (fake boldface, drop shadows)
+
+  word0 = words;
+  while (word0) {
+    sz = word0->fontSize;
+    xLimit = word0->xMin + sz * dupMaxDeltaX;
+    found = gFalse;
+    for (word1 = word0, word2 = word0->next;
+	 word2 && word2->xMin < xLimit;
+	 word1 = word2, word2 = word2->next) {
+      if (word2->len == word0->len &&
+	  !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
+	  fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
+	  fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
+	  fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
+	found = gTrue;
+	break;
+      }
+    }
+    if (found) {
+      word1->next = word2->next;
+      delete word2;
+    } else {
+      word0 = word0->next;
+    }
+  }
+
+#if 0 // for debugging
+  printf("*** words after removing duplicate text ***\n");
+  for (word0 = words; word0; word0 = word0->next) {
+    printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+	   word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+    for (i = 0; i < word0->len; ++i) {
+      fputc(word0->text[i] & 0xff, stdout);
+    }
+    printf("'\n");
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- merge words
+
+  word0 = words;
+  while (word0) {
+    sz = word0->fontSize;
+
+    // look for adjacent text which is part of the same word, and
+    // merge it into this word
+    xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
+    if (rawOrder) {
+      word1 = word0;
+      word2 = word0->next;
+      found = word2 &&
+	      word2->xMin < xLimit &&
+	      word2->font == word0->font &&
+	      fabs(word2->fontSize - sz) < 0.05 &&
+	      fabs(word2->yBase - word0->yBase) < 0.05 &&
+	      word2->charPos == word0->charPos + word0->charLen;
+    } else {
+      found = gFalse;
+      for (word1 = word0, word2 = word0->next;
+	   word2 && word2->xMin < xLimit;
+	   word1 = word2, word2 = word2->next) {
+	if (word2->font == word0->font &&
+	    fabs(word2->fontSize - sz) < 0.05 &&
+	    fabs(word2->yBase - word0->yBase) < 0.05 &&
+	    word2->charPos == word0->charPos + word0->charLen) {
+	  found = gTrue;
+	  break;
+	}
+      }
+    }
+    if (found) {
+      word0->merge(word2);
+      word1->next = word2->next;
+      delete word2;
+      continue;
+    }
+
+    word0 = word0->next;
+  }
+
+#if 0 // for debugging
+  printf("*** after merging words ***\n");
+  for (word0 = words; word0; word0 = word0->next) {
+    printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+	   word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+    for (i = 0; i < word0->len; ++i) {
+      fputc(word0->text[i] & 0xff, stdout);
+    }
+    printf("'\n");
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- assemble words into lines
+
+  lineList = line0 = NULL;
+  while (words) {
+
+    // remove the first word from the word list
+    word0 = words;
+    words = words->next;
+    word0->next = NULL;
+
+    // find the best line (if any) for the word
+    if (rawOrder) {
+      if (line0 && lineFit(line0, word0, &sp2) >= 0) {
+	line1 = line0;
+	sp1 = sp2;
+      } else {
+	line1 = NULL;
+	sp1 = 0;
+      }
+    } else {
+      line1 = NULL;
+      fit1 = 0;
+      sp1 = 0;
+      for (line2 = lineList; line2; line2 = line2->next) {
+	fit2 = lineFit(line2, word0, &sp2);
+	if (fit2 >= 0 && (!line1 || fit2 < fit1)) {
+	  line1 = line2;
+	  fit1 = fit2;
+	  sp1 = sp2;
+	}
+      }
+    }
+
+    // found a line: append the word
+    if (line1) {
+      word1 = line1->lastWord;
+      word1->next = word0;
+      line1->lastWord = word0;
+      if (word0->xMax > line1->xMax) {
+	line1->xMax = word0->xMax;
+      }
+      if (word0->yMin < line1->yMin) {
+	line1->yMin = word0->yMin;
+      }
+      if (word0->yMax > line1->yMax) {
+	line1->yMax = word0->yMax;
+      }
+      line1->len += word0->len;
+      if (sp1 > line1->fontSize * line1->font->minSpaceWidth) {
+	word1->spaceAfter = gTrue;
+	++line1->len;
+      }
+
+    // didn't find a line: create a new line
+    } else {
+      line1 = new TextLine();
+      line1->words = line1->lastWord = word0;
+      line1->xMin = word0->xMin;
+      line1->xMax = word0->xMax;
+      line1->yMin = word0->yMin;
+      line1->yMax = word0->yMax;
+      line1->yBase = word0->yBase;
+      line1->font = word0->font;
+      line1->fontSize = word0->fontSize;
+      line1->len = word0->len;
+      if (line0) {
+	line0->next = line1;
+      } else {
+	lineList = line1;
+      }
+      line0 = line1;
+    }
+  }
+
+  // build the line text
+  uMap = globalParams->getTextEncoding();
+  isUnicode = uMap ? uMap->isUnicode() : gFalse;
+
+  for (line1 = lineList; line1; line1 = line1->next) {
+    line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
+    line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
+    line1->col = (int *)gmalloc(line1->len * sizeof(int));
+    i = 0;
+    for (word1 = line1->words; word1; word1 = word1->next) {
+      for (j = 0; j < word1->len; ++j) {
+	line1->text[i] = word1->text[j];
+	line1->xRight[i] = word1->xRight[j];
+	++i;
+      }
+      if (word1->spaceAfter && word1->next) {
+	line1->text[i] = (Unicode)0x0020;
+	line1->xRight[i] = word1->next->xMin;
+	++i;
+      }
+    }
+    line1->convertedLen = 0;
+    for (j = 0; j < line1->len; ++j) {
+      line1->col[j] = line1->convertedLen;
+      if (isUnicode) {
+	++line1->convertedLen;
+      } else if (uMap) {
+	line1->convertedLen +=
+	  uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
+      }
+    }
+
+    // check for hyphen at end of line
+    //~ need to check for other chars used as hyphens
+    if (line1->text[line1->len - 1] == (Unicode)'-') {
+      line1->hyphenated = gTrue;
+    }
+
+  }
+
+  if (uMap) {
+    uMap->decRefCnt();
+  }
+
+#if 0 // for debugging
+  printf("*** lines in xy order ***\n");
+  for (line0 = lineList; line0; line0 = line0->next) {
+    printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+	   line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	   line0->yBase, line0->len);
+    for (word0 = line0->words; word0; word0 = word0->next) {
+      printf("  word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+	     word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	     word0->yBase, word0->fontSize, word0->spaceAfter);
+      for (i = 0; i < word0->len; ++i) {
+	fputc(word0->text[i] & 0xff, stdout);
+      }
+      printf("'\n");
+    }
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- column assignment
+
+  for (line1 = lineList; line1; line1 = line1->next) {
+    col1 = 0;
+    for (line2 = lineList; line2 != line1; line2 = line2->next) {
+      if (line1->xMin >= line2->xMax) {
+	d = (int)((line1->xMin - line2->xMax) /
+		  (line1->font->maxSpaceWidth * line1->fontSize));
+	if (d > 4) {
+	  d = 4;
+	}
+	col2 = line2->col[0] + line2->convertedLen + d;
+	if (col2 > col1) {
+	  col1 = col2;
+	}
+      } else if (line1->xMin > line2->xMin) {
+	for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
+	col2 = line2->col[i];
+	if (col2 > col1) {
+	  col1 = col2;
+	}
+      }
+    }
+    for (j = 0; j < line1->len; ++j) {
+      line1->col[j] += col1;
+    }
+  }
+
+#if 0 // for debugging
+  printf("*** lines in xy order, after column assignment ***\n");
+  for (line0 = lineList; line0; line0 = line0->next) {
+    printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n",
+	   line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	   line0->yBase, line0->col[0], line0->len);
+    for (word0 = line0->words; word0; word0 = word0->next) {
+      printf("  word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+	     word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	     word0->yBase, word0->fontSize, word0->spaceAfter);
+      for (i = 0; i < word0->len; ++i) {
+	fputc(word0->text[i] & 0xff, stdout);
+      }
+      printf("'\n");
+    }
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- assemble lines into blocks
+
+  if (rawOrder) {
+
+    lines = lineList;
+    for (line1 = lines; line1; line1 = line1->next) {
+      line1->xSpaceL = 0;
+      line1->xSpaceR = pageWidth;
+    }
+
+  } else {
+
+    // sort lines into yx order
+    lines = NULL;
+    while (lineList) {
+      line0 = lineList;
+      lineList = lineList->next;
+      for (line1 = NULL, line2 = lines;
+	   line2 && !line0->yxBefore(line2);
+	   line1 = line2, line2 = line2->next) ;
+      if (line1) {
+	line1->next = line0;
+      } else {
+	lines = line0;
+      }
+      line0->next = line2;
+    }
+
+    // compute whitespace to left and right of each line
+    line0 = lines;
+    for (line1 = lines; line1; line1 = line1->next) {
+
+      // find the first vertically overlapping line
+      for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
+
+      // check each vertically overlapping line -- look for the nearest
+      // on each side
+      line1->xSpaceL = 0;
+      line1->xSpaceR = pageWidth;
+      for (line2 = line0;
+	   line2 && line2->yMin < line1->yMax;
+	   line2 = line2->next) {
+	if (line2->yMax > line1->yMin) {
+	  if (line2->xMax < line1->xMin) {
+	    if (line2->xMax > line1->xSpaceL) {
+	      line1->xSpaceL = line2->xMax;
+	    }
+	  } else if (line2->xMin > line1->xMax) {
+	    if (line2->xMin < line1->xSpaceR) {
+	      line1->xSpaceR = line2->xMin;
+	    }
+	  }
+	}
+      }
+    }
+  } // (!rawOrder)
+
+#if 0 // for debugging
+  printf("*** lines in yx order ***\n");
+  for (line0 = lines; line0; line0 = line0->next) {
+    printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
+	   line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	   line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
+    for (word0 = line0->words; word0; word0 = word0->next) {
+      printf("  word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+	     word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	     word0->yBase, word0->fontSize, word0->spaceAfter);
+      for (i = 0; i < word0->len; ++i) {
+	fputc(word0->text[i] & 0xff, stdout);
+      }
+      printf("'\n");
+    }
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  lineList = lines;
+  yxBlocks = NULL;
+  blk0 = NULL;
+  while (lineList) {
+
+    // build a new block object
+    line0 = lineList;
+    lineList = lineList->next;
+    line0->next = NULL;
+    blk1 = new TextBlock();
+    blk1->lines = line0;
+    blk1->xMin = line0->xMin;
+    blk1->xMax = line0->xMax;
+    blk1->yMin = line0->yMin;
+    blk1->yMax = line0->yMax;
+    blk1->xSpaceL = line0->xSpaceL;
+    blk1->xSpaceR = line0->xSpaceR;
+    blk1->maxFontSize = line0->fontSize;
+
+    // find subsequent lines in the block
+    while (lineList) {
+
+      // look for the first horizontally overlapping line below this
+      // one
+      yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
+      line3 = line4 = NULL;
+      if (rawOrder) {
+	if (lineList->yMin < yLimit &&
+	    lineList->xMax > blk1->xMin &&
+	    lineList->xMin < blk1->xMax) {
+	  line3 = NULL;
+	  line4 = lineList;
+	}
+      } else {
+	for (line1 = NULL, line2 = lineList;
+	     line2 && line2->yMin < yLimit;
+	     line1 = line2, line2 = line2->next) {
+	  if (line2->xMax > blk1->xMin &&
+	      line2->xMin < blk1->xMax) {
+	    line3 = line1;
+	    line4 = line2;
+	    break;
+	  }
+	}
+      }
+
+      // if there is an overlapping line and it fits in the block, add
+      // it to the block
+      if (line4 && blockFit(blk1, line4)) {
+	if (line3) {
+	  line3->next = line4->next;
+	} else {
+	  lineList = line4->next;
+	}
+	line0->next = line0->flowNext = line4;
+	line4->next = NULL;
+	if (line4->xMin < blk1->xMin) {
+	  blk1->xMin = line4->xMin;
+	} else if (line4->xMax > blk1->xMax) {
+	  blk1->xMax = line4->xMax;
+	}
+	if (line4->yMax > blk1->yMax) {
+	  blk1->yMax = line4->yMax;
+	}
+	if (line4->xSpaceL > blk1->xSpaceL) {
+	  blk1->xSpaceL = line4->xSpaceL;
+	}
+	if (line4->xSpaceR < blk1->xSpaceR) {
+	  blk1->xSpaceR = line4->xSpaceR;
+	}
+	if (line4->fontSize > blk1->maxFontSize) {
+	  blk1->maxFontSize = line4->fontSize;
+	}
+	line0 = line4;
+
+      // otherwise, we're done with this block
+      } else {
+	break;
+      }
+    }
+
+    // insert block on list, in yx order
+    if (rawOrder) {
+      blk2 = blk0;
+      blk3 = NULL;
+      blk0 = blk1;
+    } else {
+      for (blk2 = NULL, blk3 = yxBlocks;
+	   blk3 && !blk1->yxBefore(blk3);
+	   blk2 = blk3, blk3 = blk3->next) ;
+    }
+    blk1->next = blk3;
+    if (blk2) {
+      blk2->next = blk1;
+    } else {
+      yxBlocks = blk1;
+    }
+  }
+
+#if 0 // for debugging
+  printf("*** blocks in yx order ***\n");
+  for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
+    printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+	   blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+    for (line0 = blk0->lines; line0; line0 = line0->next) {
+      printf("  [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+	     line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	     line0->yBase, line0->len);
+      for (word0 = line0->words; word0; word0 = word0->next) {
+	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	       word0->yBase, word0->spaceAfter);
+	for (i = 0; i < word0->len; ++i) {
+	  fputc(word0->text[i] & 0xff, stdout);
+	}
+	printf("'\n");
+      }
+    }
+  }
+  printf("\n");
+  fflush(stdout);
+#endif
+
+  //----- merge lines and blocks, sort blocks into reading order
 
-//------------------------------------------------------------------------
-// Character substitutions
-//------------------------------------------------------------------------
+  if (rawOrder) {
+    blocks = yxBlocks;
 
-static char *isoLatin1Subst[] = {
-  "L",				// Lslash
-  "OE",				// OE
-  "S",				// Scaron
-  "Y",				// Ydieresis
-  "Z",				// Zcaron
-  "fi", "fl",			// ligatures
-  "ff", "ffi", "ffl",		// ligatures
-  "i",				// dotlessi
-  "l",				// lslash
-  "oe",				// oe
-  "s",				// scaron
-  "z",				// zcaron
-  "*",				// bullet
-  "...",			// ellipsis
-  "-", "-",			// emdash, hyphen
-  "\"", "\"",			// quotedblleft, quotedblright
-  "'",				// quotesingle
-  "TM"				// trademark
-};
-
-static char *ascii7Subst[] = {
-  "A", "A", "A", "A",		// A{acute,circumflex,dieresis,grave}
-  "A", "A",			// A{ring,tilde}
-  "AE",				// AE
-  "C",				// Ccedilla
-  "E", "E", "E", "E",		// E{acute,circumflex,dieresis,grave}
-  "I", "I", "I", "I",		// I{acute,circumflex,dieresis,grave}
-  "L",				// Lslash
-  "N",				// Ntilde
-  "O", "O", "O", "O",		// O{acute,circumflex,dieresis,grave}
-  "O", "O",			// O{slash,tilde}
-  "OE",				// OE
-  "S",				// Scaron
-  "U", "U", "U", "U",		// U{acute,circumflex,dieresis,grave}
-  "Y", "Y",			// T{acute,dieresis}
-  "Z",				// Zcaron
-  "a", "a", "a", "a",		// a{acute,circumflex,dieresis,grave}
-  "a", "a",			// a{ring,tilde}
-  "ae",				// ae
-  "c",				// ccedilla
-  "e", "e", "e", "e",		// e{acute,circumflex,dieresis,grave}
-  "fi", "fl",			// ligatures
-  "ff", "ffi", "ffl",		// ligatures
-  "i",				// dotlessi
-  "i", "i", "i", "i",		// i{acute,circumflex,dieresis,grave}
-  "l",				// lslash
-  "n",				// ntilde
-  "o", "o", "o", "o",		// o{acute,circumflex,dieresis,grave}
-  "o", "o",			// o{slash,tilde}
-  "oe",				// oe
-  "s",				// scaron
-  "u", "u", "u", "u",		// u{acute,circumflex,dieresis,grave}
-  "y", "y",			// t{acute,dieresis}
-  "z",				// zcaron
-  "|",				// brokenbar
-  "*",				// bullet
-  "...",			// ellipsis
-  "-", "-", "-",		// emdash, endash, hyphen
-  "\"", "\"",			// quotedblleft, quotedblright
-  "'",				// quotesingle
-  "(R)",			// registered
-  "TM"				// trademark
-};
+  } else {
+    blocks = NULL;
+    blk0 = NULL;
+    blkStack = NULL;
+    while (yxBlocks) {
+
+      // find the next two blocks:
+      // - if the depth-first traversal stack is empty, take the first
+      //   (upper-left-most) two blocks on the yx-sorted block list
+      // - otherwise, find the two upper-left-most blocks under the top
+      //   block on the stack
+      if (blkStack) {
+	blk3 = blk4 = blk5 = blk6 = NULL;
+	for (blk1 = NULL, blk2 = yxBlocks;
+	     blk2;
+	     blk1 = blk2, blk2 = blk2->next) {
+	  if (blk2->yMin > blkStack->yMin &&
+	      blk2->xMax > blkStack->xMin &&
+	      blk2->xMin < blkStack->xMax) {
+	    if (!blk4 || blk2->yxBefore(blk4)) {
+	      blk5 = blk3;
+	      blk6 = blk4;
+	      blk3 = blk1;
+	      blk4 = blk2;
+	    } else if (!blk6 || blk2->yxBefore(blk6)) {
+	      blk5 = blk1;
+	      blk6 = blk2;
+	    }
+	  }
+	}
+      } else {
+	blk3 = NULL;
+	blk4 = yxBlocks;
+	blk5 = yxBlocks;
+	blk6 = yxBlocks->next;
+      }
 
-//------------------------------------------------------------------------
-// 16-bit fonts
-//------------------------------------------------------------------------
+      // merge case 1:
+      //   |                     |           |
+      //   |   blkStack          |           |   blkStack
+      //   +---------------------+    -->    +--------------
+      //   +------+ +------+                 +-----------+
+      //   | blk4 | | blk6 | ...             | blk4+blk6 |
+      //   +------+ +------+                 +-----------+
+      yLimit = 0; // make gcc happy
+      if (blkStack) {
+	yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize;
+      }
+      if (blkStack && blk4 && blk6 &&
+	  !blk4->lines->next && !blk6->lines->next &&
+	  lineFit2(blk4->lines, blk6->lines) &&
+	  blk4->yMin < yLimit &&
+	  blk4->xMin > blkStack->xSpaceL &&
+	  blkStack->xMin > blk4->xSpaceL &&
+	  blk6->xMax < blkStack->xSpaceR) {
+	blk4->mergeRight(blk6);
+	if (blk5) {
+	  blk5->next = blk6->next;
+	} else {
+	  yxBlocks = blk6->next;
+	}
+	delete blk6;
+
+      // merge case 2:
+      //   |                     |           |                   |
+      //   |   blkStack          |           |                   |
+      //   +---------------------+    -->    |   blkStack+blk2   |
+      //   +---------------------+           |                   |
+      //   |   blk4              |           |                   |
+      //   |                     |           |                   |
+      } else if (blkStack && blk4 &&
+		 blk4->yMin < yLimit &&
+		 blockFit2(blkStack, blk4)) {
+	blkStack->mergeBelow(blk4);
+	if (blk3) {
+	  blk3->next = blk4->next;
+	} else {
+	  yxBlocks = blk4->next;
+	}
+	delete blk4;
+
+      // if any of:
+      // 1. no block found
+      // 2. non-fully overlapping block found
+      // 3. large vertical gap above the overlapping block
+      // then pop the stack and try again
+      } else if (!blk4 ||
+		 (blkStack && (blk4->xMin < blkStack->xSpaceL ||
+			       blk4->xMax > blkStack->xSpaceR ||
+			       blk4->yMin - blkStack->yMax >
+			         blkMaxSortSpacing * blkStack->maxFontSize))) {
+	blkStack = blkStack->stackNext;
+
+      // add a block to the sorted list
+      } else {
 
-#if JAPANESE_SUPPORT
-
-// CID 0 .. 96
-static Gushort japan12Map[96] = {
-  0x2120, 0x2120, 0x212a, 0x2149, 0x2174, 0x2170, 0x2173, 0x2175, // 00 .. 07
-  0x2147, 0x214a, 0x214b, 0x2176, 0x215c, 0x2124, 0x213e, 0x2123, // 08 .. 0f
-  0x213f, 0x2330, 0x2331, 0x2332, 0x2333, 0x2334, 0x2335, 0x2336, // 10 .. 17
-  0x2337, 0x2338, 0x2339, 0x2127, 0x2128, 0x2163, 0x2161, 0x2164, // 18 .. 1f
-  0x2129, 0x2177, 0x2341, 0x2342, 0x2343, 0x2344, 0x2345, 0x2346, // 20 .. 27
-  0x2347, 0x2348, 0x2349, 0x234a, 0x234b, 0x234c, 0x234d, 0x234e, // 28 .. 2f
-  0x234f, 0x2350, 0x2351, 0x2352, 0x2353, 0x2354, 0x2355, 0x2356, // 30 .. 37
-  0x2357, 0x2358, 0x2359, 0x235a, 0x214e, 0x216f, 0x214f, 0x2130, // 38 .. 3f
-  0x2132, 0x2146, 0x2361, 0x2362, 0x2363, 0x2364, 0x2365, 0x2366, // 40 .. 47
-  0x2367, 0x2368, 0x2369, 0x236a, 0x236b, 0x236c, 0x236d, 0x236e, // 48 .. 4f
-  0x236f, 0x2370, 0x2371, 0x2372, 0x2373, 0x2374, 0x2375, 0x2376, // 50 .. 57
-  0x2377, 0x2378, 0x2379, 0x237a, 0x2150, 0x2143, 0x2151, 0x2141  // 58 .. 5f
-};
-
-// CID 325 .. 421
-static Gushort japan12KanaMap1[97] = {
-  0x2131, 0x2121, 0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572,
-  0x2521, 0x2523, 0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567,
-  0x2543, 0x213c, 0x2522, 0x2524, 0x2526, 0x2528, 0x252a, 0x252b,
-  0x252d, 0x252f, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253b,
-  0x253d, 0x253f, 0x2541, 0x2544, 0x2546, 0x2548, 0x254a, 0x254b,
-  0x254c, 0x254d, 0x254e, 0x254f, 0x2552, 0x2555, 0x2558, 0x255b,
-  0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568,
-  0x2569, 0x256a, 0x256b, 0x256c, 0x256d, 0x256f, 0x2573, 0x212b,
-  0x212c, 0x212e, 0x2570, 0x2571, 0x256e, 0x2575, 0x2576, 0x2574,
-  0x252c, 0x252e, 0x2530, 0x2532, 0x2534, 0x2536, 0x2538, 0x253a,
-  0x253c, 0x253e, 0x2540, 0x2542, 0x2545, 0x2547, 0x2549, 0x2550,
-  0x2551, 0x2553, 0x2554, 0x2556, 0x2557, 0x2559, 0x255a, 0x255c,
-  0x255d
-};
-
-// CID 501 .. 598
-static Gushort japan12KanaMap2[98] = {
-  0x212d, 0x212f, 0x216d, 0x214c, 0x214d, 0x2152, 0x2153, 0x2154,
-  0x2155, 0x2158, 0x2159, 0x215a, 0x215b, 0x213d, 0x2121, 0x2472,
-  0x2421, 0x2423, 0x2425, 0x2427, 0x2429, 0x2463, 0x2465, 0x2467,
-  0x2443, 0x2422, 0x2424, 0x2426, 0x2428, 0x242a, 0x242b, 0x242d,
-  0x242f, 0x2431, 0x2433, 0x2435, 0x2437, 0x2439, 0x243b, 0x243d,
-  0x243f, 0x2441, 0x2444, 0x2446, 0x2448, 0x244a, 0x244b, 0x244c,
-  0x244d, 0x244e, 0x244f, 0x2452, 0x2455, 0x2458, 0x245b, 0x245e,
-  0x245f, 0x2460, 0x2461, 0x2462, 0x2464, 0x2466, 0x2468, 0x2469,
-  0x246a, 0x246b, 0x246c, 0x246d, 0x246f, 0x2473, 0x2470, 0x2471,
-  0x246e, 0x242c, 0x242e, 0x2430, 0x2432, 0x2434, 0x2436, 0x2438,
-  0x243a, 0x243c, 0x243e, 0x2440, 0x2442, 0x2445, 0x2447, 0x2449,
-  0x2450, 0x2451, 0x2453, 0x2454, 0x2456, 0x2457, 0x2459, 0x245a,
-  0x245c, 0x245d
-};
-
-static char *japan12Roman[10] = {
-  "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
-};
-
-static char *japan12Abbrev1[6] = {
-  "mm", "cm", "km", "mg", "kg", "cc"
-};
+	// remove the block from the yx-sorted list
+	if (blk3) {
+	  blk3->next = blk4->next;
+	} else {
+	  yxBlocks = blk4->next;
+	}
+	blk4->next = NULL;
+
+	// append the block to the reading-order list
+	if (blk0) {
+	  blk0->next = blk4;
+	} else {
+	  blocks = blk4;
+	}
+	blk0 = blk4;
 
+	// push the block on the traversal stack
+	if (!physLayout) {
+	  blk4->stackNext = blkStack;
+	  blkStack = blk4;
+	}
+      }
+    }
+  } // (!rawOrder)
+
+#if 0 // for debugging
+  printf("*** blocks in reading order (after merging) ***\n");
+  for (blk0 = blocks; blk0; blk0 = blk0->next) {
+    printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+	   blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+    for (line0 = blk0->lines; line0; line0 = line0->next) {
+      printf("  [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+	     line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	     line0->yBase, line0->len);
+      for (word0 = line0->words; word0; word0 = word0->next) {
+	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	       word0->yBase, word0->spaceAfter);
+	for (i = 0; i < word0->len; ++i) {
+	  fputc(word0->text[i] & 0xff, stdout);
+	}
+	printf("'\n");
+      }
+    }
+  }
+  printf("\n");
+  fflush(stdout);
 #endif
 
-//------------------------------------------------------------------------
-// TextString
-//------------------------------------------------------------------------
+  //----- assemble blocks into flows
 
-TextString::TextString(GfxState *state, GBool hexCodes1) {
-  double x, y, h;
+  if (rawOrder) {
 
-  state->transform(state->getCurX(), state->getCurY(), &x, &y);
-  h = state->getTransformedFontSize();
-  //~ yMin/yMax computation should use font ascent/descent values
-  yMin = y - 0.95 * h;
-  yMax = yMin + 1.3 * h;
-  col = 0;
-  text = new GString();
-  xRight = NULL;
-  yxNext = NULL;
-  xyNext = NULL;
-  hexCodes = hexCodes1;
-}
+    // one flow per block
+    flow0 = NULL;
+    while (blocks) {
+      flow1 = new TextFlow();
+      flow1->blocks = blocks;
+      flow1->lines = blocks->lines;
+      flow1->yMin = blocks->yMin;
+      flow1->yMax = blocks->yMax;
+      blocks = blocks->next;
+      flow1->blocks->next = NULL;
+      if (flow0) {
+	flow0->next = flow1;
+      } else {
+	flows = flow1;
+      }
+      flow0 = flow1;
+    }
 
-TextString::~TextString() {
-  delete text;
-  gfree(xRight);
-}
+  } else {
 
-void TextString::addChar(GfxState *state, double x, double y,
-			 double dx, double dy,
-			 Guchar c, GBool useASCII7) {
-  char *charName, *sub;
-  int c1;
-  int i, j, n, m;
-
-  // get current index
-  i = text->getLength();
-
-  // append translated character(s) to string
-  sub = NULL;
-  n = 1;
-  if ((charName = state->getFont()->getCharName(c))) {
-    if (useASCII7)
-      c1 = ascii7Encoding.getCharCode(charName);
-    else
-      c1 = isoLatin1Encoding.getCharCode(charName);
-    if (c1 < 0) {
-      m = strlen(charName);
-      if (hexCodes && m == 3 &&
-	  (charName[0] == 'B' || charName[0] == 'C' ||
-	   charName[0] == 'G') &&
-	  isxdigit(charName[1]) && isxdigit(charName[2])) {
-	sscanf(charName+1, "%x", &c1);
-      } else if (!hexCodes && m >= 2 && m <= 3 &&
-		 isdigit(charName[0]) && isdigit(charName[1])) {
-	c1 = atoi(charName);
-	if (c1 >= 256)
-	  c1 = -1;
-      } else if (!hexCodes && m >= 3 && m <= 5 && isdigit(charName[1])) {
-	c1 = atoi(charName+1);
-	if (c1 >= 256)
-	  c1 = -1;
-      }
-      //~ this is a kludge -- is there a standard internal encoding
-      //~ used by all/most Type 1 fonts?
-      if (c1 == 262)		// hyphen
-	c1 = 45;
-      else if (c1 == 266)	// emdash
-	c1 = 208;
-      if (useASCII7)
-	c1 = ascii7Encoding.getCharCode(isoLatin1Encoding.getCharName(c1));
-    }
-    if (useASCII7) {
-      if (c1 >= 128) {
-	sub = ascii7Subst[c1 - 128];
-	n = strlen(sub);
-      }
-    } else {
-      if (c1 >= 256) {
-	sub = isoLatin1Subst[c1 - 256];
-	n = strlen(sub);
+    // compute whitespace above and below each block
+    for (blk0 = blocks; blk0; blk0 = blk0->next) {
+      blk0->ySpaceT = 0;
+      blk0->ySpaceB = pageHeight;
+
+      // check each horizontally overlapping block
+      for (blk1 = blocks; blk1; blk1 = blk1->next) {
+	if (blk1 != blk0 &&
+	    blk1->xMin < blk0->xMax &&
+	    blk1->xMax > blk0->xMin) {
+	  if (blk1->yMax < blk0->yMin) {
+	    if (blk1->yMax > blk0->ySpaceT) {
+	      blk0->ySpaceT = blk1->yMax;
+	    }
+	  } else if (blk1->yMin > blk0->yMax) {
+	    if (blk1->yMin < blk0->ySpaceB) {
+	      blk0->ySpaceB = blk1->yMin;
+	    }
+	  }
+	}
       }
     }
-  } else {
-    c1 = -1;
-  }
-  if (sub)
-    text->append(sub);
-  else if (c1 >= 0)
-    text->append((char)c1);
-  else
-    text->append(' ');
-
-  // update position information
-  if (i+n > ((i+15) & ~15))
-    xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double));
-  if (i == 0)
-    xMin = x;
-  for (j = 0; j < n; ++j)
-    xRight[i+j] = x + ((j+1) * dx) / n;
-  xMax = x + dx;
-}
-
-void TextString::addChar16(GfxState *state, double x, double y,
-			   double dx, double dy,
-			   int c, GfxFontCharSet16 charSet) {
-  int c1, t1, t2;
-  int sub[8];
-  char *p;
-  int *q;
-  int i, j, n;
-
-  // get current index
-  i = text->getLength();
-
-  // convert the 16-bit character
-  c1 = 0;
-  sub[0] = 0;
-  switch (charSet) {
-
-  // convert Adobe-Japan1-2 to JIS X 0208-1983
-  case font16AdobeJapan12:
-#if JAPANESE_SUPPORT
-    if (c <= 96) {
-      c1 = 0x8080 + japan12Map[c];
-    } else if (c <= 632) {
-      if (c <= 230)
-	c1 = 0;
-      else if (c <= 324)
-	c1 = 0x8080 + japan12Map[c - 230];
-      else if (c <= 421)
-	c1 = 0x8080 + japan12KanaMap1[c - 325];
-      else if (c <= 500)
-	c1 = 0;
-      else if (c <= 598)
-	c1 = 0x8080 + japan12KanaMap2[c - 501];
-      else
-	c1 = 0;
-    } else if (c <= 1124) {
-      if (c <= 779) {
-	if (c <= 726)
-	  c1 = 0xa1a1 + (c - 633);
-	else if (c <= 740)
-	  c1 = 0xa2a1 + (c - 727);
-	else if (c <= 748)
-	  c1 = 0xa2ba + (c - 741);
-	else if (c <= 755)
-	  c1 = 0xa2ca + (c - 749);
-	else if (c <= 770)
-	  c1 = 0xa2dc + (c - 756);
-	else if (c <= 778)
-	  c1 = 0xa2f2 + (c - 771);
-	else
-	  c1 = 0xa2fe;
-      } else if (c <= 841) {
-	if (c <= 789)
-	  c1 = 0xa3b0 + (c - 780);
-	else if (c <= 815)
-	  c1 = 0xa3c1 + (c - 790);
-	else
-	  c1 = 0xa3e1 + (c - 816);
-      } else if (c <= 1010) {
-	if (c <= 924)
-	  c1 = 0xa4a1 + (c - 842);
-	else
-	  c1 = 0xa5a1 + (c - 925);
-      } else {
-	if (c <= 1034)
-	  c1 = 0xa6a1 + (c - 1011);
-	else if (c <= 1058)
-	  c1 = 0xa6c1 + (c - 1035);
-	else if (c <= 1091)
-	  c1 = 0xa7a1 + (c - 1059);
-	else
-	  c1 = 0xa7d1 + (c - 1092);
-      }
-    } else if (c <= 4089) {
-      t1 = (c - 1125) / 94;
-      t2 = (c - 1125) % 94;
-      c1 = 0xb0a1 + (t1 << 8) + t2;
-    } else if (c <= 7477) {
-      t1 = (c - 4090) / 94;
-      t2 = (c - 4090) % 94;
-      c1 = 0xd0a1 + (t1 << 8) + t2;
-    } else if (c <= 7554) {
-      c1 = 0;
-    } else if (c <= 7563) {	// circled Arabic numbers 1..9
-      c1 = 0xa3b1 + (c - 7555);
-    } else if (c <= 7574) {	// circled Arabic numbers 10..20
-      t1 = c - 7564 + 10;
-      sub[0] = 0xa3b0 + (t1 / 10);
-      sub[1] = 0xa3b0 + (t1 % 10);
-      sub[2] = 0;
-      c1 = -1;
-    } else if (c <= 7584) {	// Roman numbers I..X
-      for (p = japan12Roman[c - 7575], q = sub; *p; ++p, ++q) {
-	*q = 0xa380 + *p;
-      }
-      *q = 0;
-      c1 = -1;
-    } else if (c <= 7632) {
-      if (c <= 7600) {
-	c1 = 0;
-      } else if (c <= 7606) {
-	for (p = japan12Abbrev1[c - 7601], q = sub; *p; ++p, ++q) {
-	  *q = 0xa380 + *p;
-	}
-	*q = 0;
-	c1 = -1;
+
+    flow0 = NULL;
+    while (blocks) {
+
+      // build a new flow object
+      flow1 = new TextFlow();
+      flow1->blocks = blocks;
+      flow1->lines = blocks->lines;
+      flow1->yMin = blocks->yMin;
+      flow1->yMax = blocks->yMax;
+      flow1->ySpaceT = blocks->ySpaceT;
+      flow1->ySpaceB = blocks->ySpaceB;
+
+      // find subsequent blocks in the flow
+      for (blk1 = blocks, blk2 = blocks->next;
+	   blk2 && flowFit(flow1, blk2);
+	   blk1 = blk2, blk2 = blk2->next) {
+	if (blk2->yMin < flow1->yMin) {
+	  flow1->yMin = blk2->yMin;
+	}
+	if (blk2->yMax > flow1->yMax) {
+	  flow1->yMax = blk2->yMax;
+	}
+	if (blk2->ySpaceT > flow1->ySpaceT) {
+	  flow1->ySpaceT = blk2->ySpaceT;
+	}
+	if (blk2->ySpaceB < flow1->ySpaceB) {
+	  flow1->ySpaceB = blk2->ySpaceB;
+	}
+	for (line1 = blk1->lines; line1->next; line1 = line1->next) ;
+	line1->flowNext = blk2->lines;
+      }
+
+      // chop the block list
+      blocks = blk1->next;
+      blk1->next = NULL;
+
+      // append the flow to the list
+      if (flow0) {
+	flow0->next = flow1;
       } else {
-	c1 = 0;
+	flows = flow1;
       }
-    } else {
-      c1 = 0;
+      flow0 = flow1;
     }
-#endif // JAPANESE_SUPPORT
-    break;
   }
 
-  // append converted character to string
-  if (c1 == 0) {
-#if 0 //~
-    error(-1, "Unsupported Adobe-Japan1-2 character: %d", c);
+#if 0 // for debugging
+  printf("*** flows ***\n");
+  for (flow0 = flows; flow0; flow0 = flow0->next) {
+    printf("[flow]\n");
+    for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) {
+      printf("  [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n",
+	     blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax,
+	     blk0->ySpaceT, blk0->ySpaceB);
+      for (line0 = blk0->lines; line0; line0 = line0->next) {
+	printf("    [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+	       line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	       line0->yBase, line0->len);
+	for (word0 = line0->words; word0; word0 = word0->next) {
+	  printf("      word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+		 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+		 word0->yBase, word0->spaceAfter);
+	  for (i = 0; i < word0->len; ++i) {
+	    fputc(word0->text[i] & 0xff, stdout);
+	  }
+	  printf("'\n");
+	}
+      }
+    }
+  }
+  printf("\n");
+  fflush(stdout);
 #endif
-    text->append(' ');
-    n = 1;
-  } else if (c1 > 0) {
-    text->append(c1 >> 8);
-    text->append(c1 & 0xff);
-    n = 2;
+
+  //----- sort lines into yx order
+
+  // (the block/line merging process doesn't maintain the full-page
+  // linked list of lines)
+
+  lines = NULL;
+  if (rawOrder) {
+    line0 = NULL;
+    for (flow0 = flows; flow0; flow0 = flow0->next) {
+      for (line1 = flow0->lines; line1; line1 = line1->flowNext) {
+	if (line0) {
+	  line0->pageNext = line1;
+	} else {
+	  lines = line1;
+	}
+	line0 = line1;
+      }
+    }
   } else {
-    n = 0;
-    for (q = sub; *q; ++q) {
-      text->append(*q >> 8);
-      text->append(*q & 0xff);
-      n += 2;
+    for (flow0 = flows; flow0; flow0 = flow0->next) {
+      for (line0 = flow0->lines; line0; line0 = line0->flowNext) {
+	for (line1 = NULL, line2 = lines;
+	     line2 && !line0->yxBefore(line2);
+	     line1 = line2, line2 = line2->pageNext) ;
+	if (line1) {
+	  line1->pageNext = line0;
+	} else {
+	  lines = line0;
+	}
+	line0->pageNext = line2;
+      }
     }
   }
 
-  // update position information
-  if (i+n > ((i+15) & ~15)) {
-    xRight = (double *)grealloc(xRight, ((i+n+15) & ~15) * sizeof(double));
-  }
-  if (i == 0) {
-    xMin = x;
-  }
-  for (j = 0; j < n; ++j) {
-    xRight[i+j] = x + dx;
+#if 0 // for debugging
+  printf("*** lines in yx order ***\n");
+  for (line0 = lines; line0; line0 = line0->pageNext) {
+    printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n",
+	   line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+	   line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0],
+	   line0->len);
+    for (word0 = line0->words; word0; word0 = word0->next) {
+      printf("  word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+	     word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+	     word0->yBase, word0->spaceAfter);
+      for (i = 0; i < word0->len; ++i) {
+	fputc(word0->text[i] & 0xff, stdout);
+      }
+      printf("'\n");
+    }
   }
-  xMax = x + dx;
+  printf("\n");
+  fflush(stdout);
+#endif
 }
 
-//------------------------------------------------------------------------
-// TextPage
-//------------------------------------------------------------------------
+// If <word> can be added the end of <line>, return the absolute value
+// of the difference between <line>'s baseline and <word>'s baseline,
+// and set *<space> to the horizontal space between the current last
+// word in <line> and <word>.  A smaller return value indicates a
+// better fit.  Otherwise, return a negative number.
+double TextPage::lineFit(TextLine *line, TextWord *word, double *space) {
+  TextWord *lastWord;
+  double fontSize0, fontSize1;
+  double dx, dxLimit;
+
+  lastWord = line->lastWord;
+  fontSize0 = line->fontSize;
+  fontSize1 = word->fontSize;
+  dx = word->xMin - lastWord->xMax;
+  dxLimit = fontSize0 * lastWord->font->maxSpaceWidth;
+
+  // check inter-word spacing
+  if (dx < fontSize0 * lineMinDeltaX ||
+      dx > dxLimit) {
+    return -1;
+  }
 
-TextPage::TextPage(GBool useASCII7, GBool rawOrder) {
-  this->useASCII7 = useASCII7;
-  this->rawOrder = rawOrder;
-  curStr = NULL;
-  yxStrings = NULL;
-  xyStrings = NULL;
-  yxCur1 = yxCur2 = NULL;
-}
+  if (
+      // look for adjacent words with close baselines and close font sizes
+      (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 &&
+       fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+       fontSize1 < lineMaxFontSizeRatio * fontSize0) ||
+
+      // look for a superscript
+      (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 &&
+       fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 &&
+       (word->yMax < lastWord->yMax ||
+	word->yBase < lastWord->yBase) &&
+       word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 &&
+       dx < fontSize0 * lineMaxSuperscriptDeltaX) ||
+
+      // look for a subscript
+      (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 &&
+       fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 &&
+       (word->yMin > lastWord->yMin ||
+	word->yBase > lastWord->yBase) &&
+       line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 &&
+       dx < fontSize0 * lineMaxSubscriptDeltaX)) {
+
+    *space = dx;
+    return fabs(word->yBase - line->yBase);
+  }
 
-TextPage::~TextPage() {
-  clear();
+  return -1;
 }
 
-void TextPage::beginString(GfxState *state, GString *s, GBool hexCodes) {
-  curStr = new TextString(state, hexCodes);
-}
+// Returns true if <line0> and <line1> can be merged into a single
+// line, ignoring max word spacing.
+GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) {
+  double fontSize0, fontSize1;
+  double dx;
 
-void TextPage::addChar(GfxState *state, double x, double y,
-		       double dx, double dy, Guchar c) {
-  double x1, y1, w1, h1, dx2, dy2;
-  int n;
-  GBool hexCodes;
+  fontSize0 = line0->fontSize;
+  fontSize1 = line1->fontSize;
+  dx = line1->xMin - line0->xMax;
 
-  state->transform(x, y, &x1, &y1);
-  state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2);
-  dx -= dx2;
-  dy -= dy2;
-  state->transformDelta(dx, dy, &w1, &h1);
-  n = curStr->text->getLength();
-  if (n > 0 &&
-      x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
-    hexCodes = curStr->hexCodes;
-    endString();
-    beginString(state, NULL, hexCodes);
+  // check inter-word spacing
+  if (dx < fontSize0 * lineMinDeltaX) {
+    return gFalse;
   }
-  curStr->addChar(state, x1, y1, w1, h1, c, useASCII7);
-}
 
-void TextPage::addChar16(GfxState *state, double x, double y,
-			 double dx, double dy, int c,
-			 GfxFontCharSet16 charSet) {
-  double x1, y1, w1, h1, dx2, dy2;
-  int n;
-  GBool hexCodes;
-
-  state->transform(x, y, &x1, &y1);
-  state->textTransformDelta(state->getCharSpace(), 0, &dx2, &dy2);
-  dx -= dx2;
-  dy -= dy2;
-  state->transformDelta(dx, dy, &w1, &h1);
-  n = curStr->text->getLength();
-  if (n > 0 &&
-      x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
-    hexCodes = curStr->hexCodes;
-    endString();
-    beginString(state, NULL, hexCodes);
+  // look for close baselines and close font sizes
+  if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 &&
+      fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+      fontSize1 < lineMaxFontSizeRatio * fontSize0) {
+    return gTrue;
   }
-  curStr->addChar16(state, x1, y1, w1, h1, c, charSet);
+
+  return gFalse;
 }
 
-void TextPage::endString() {
-  TextString *p1, *p2;
-  double h, y1, y2;
+// Returns true if <line> can be added to <blk>.  Assumes the y
+// coordinates are within range.
+GBool TextPage::blockFit(TextBlock *blk, TextLine *line) {
+  double fontSize0, fontSize1;
+
+  // check edges
+  if (line->xMin < blk->xSpaceL ||
+      line->xMax > blk->xSpaceR ||
+      blk->xMin < line->xSpaceL ||
+      blk->xMax > line->xSpaceR) {
+    return gFalse;
+  }
 
-  // throw away zero-length strings -- they don't have valid xMin/xMax
-  // values, and they're useless anyway
-  if (curStr->text->getLength() == 0) {
-    delete curStr;
-    curStr = NULL;
-    return;
+  // check font sizes
+  fontSize0 = blk->lines->fontSize;
+  fontSize1 = line->fontSize;
+  if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+      fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+    return gFalse;
   }
 
-#if 0 //~tmp
-  if (curStr->yMax - curStr->yMin > 20) {
-    delete curStr;
-    curStr = NULL;
-    return;
+  return gTrue;
+}
+
+// Returns true if <blk0> and <blk1> can be merged into a single
+// block.  Assumes the y coordinates are within range.
+GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) {
+  double fontSize0, fontSize1;
+
+  // check edges
+  if (blk1->xMin < blk0->xSpaceL ||
+      blk1->xMax > blk0->xSpaceR ||
+      blk0->xMin < blk1->xSpaceL ||
+      blk0->xMax > blk1->xSpaceR) {
+    return gFalse;
   }
-#endif
 
-  // insert string in y-major list
-  h = curStr->yMax - curStr->yMin;
-  y1 = curStr->yMin + 0.5 * h;
-  y2 = curStr->yMin + 0.8 * h;
-  if (rawOrder) {
-    p1 = yxCur1;
-    p2 = NULL;
-  } else if ((!yxCur1 ||
-	      (y1 >= yxCur1->yMin &&
-	       (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) &&
-	     (!yxCur2 ||
-	      (y1 < yxCur2->yMin ||
-	       (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
-    p1 = yxCur1;
-    p2 = yxCur2;
-  } else {
-    for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
-      if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin))
-	break;
-    }
-    yxCur2 = p2;
+  // check font sizes
+  fontSize0 = blk0->lines->fontSize;
+  fontSize1 = blk1->lines->fontSize;
+  if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+      fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+    return gFalse;
   }
-  yxCur1 = curStr;
-  if (p1)
-    p1->yxNext = curStr;
-  else
-    yxStrings = curStr;
-  curStr->yxNext = p2;
-  curStr = NULL;
+
+  return gTrue;
 }
 
-void TextPage::coalesce() {
-  TextString *str1, *str2;
-  double space, d;
-  int n, i;
+// Returns true if <blk> can be added to <flow>.
+GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) {
+  double dy;
 
-#if 0 //~ for debugging
-  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
-    printf("x=%3d..%3d  y=%3d..%3d  size=%2d '%s'\n",
-	   (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
-	   (int)(str1->yMax - str1->yMin), str1->text->getCString());
-  }
-  printf("\n------------------------------------------------------------\n\n");
-#endif
-  str1 = yxStrings;
-  while (str1 && (str2 = str1->yxNext)) {
-    space = str1->yMax - str1->yMin;
-    d = str2->xMin - str1->xMax;
-#if 0 //~tmp
-    if (((rawOrder &&
-	  ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
-	   (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
-	 (!rawOrder && str2->yMin < str1->yMax)) &&
-	d > -0.1 * space && d < 0.2 * space) {
-#else
-    if (((rawOrder &&
-	  ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
-	   (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
-	 (!rawOrder && str2->yMin < str1->yMax)) &&
-	d > -0.5 * space && d < space) {
-#endif
-      n = str1->text->getLength();
-      if (d > 0.1 * space)
-	str1->text->append(' ');
-      str1->text->append(str2->text);
-      str1->xRight = (double *)
-	grealloc(str1->xRight, str1->text->getLength() * sizeof(double));
-      if (d > 0.1 * space)
-	str1->xRight[n++] = str2->xMin;
-      for (i = 0; i < str2->text->getLength(); ++i)
-	str1->xRight[n++] = str2->xRight[i];
-      if (str2->xMax > str1->xMax)
-	str1->xMax = str2->xMax;
-      if (str2->yMax > str1->yMax)
-	str1->yMax = str2->yMax;
-      str1->yxNext = str2->yxNext;
-      delete str2;
-    } else {
-      str1 = str2;
-    }
+  // check whitespace above and below
+  if (blk->yMin < flow->ySpaceT ||
+      blk->yMax > flow->ySpaceB ||
+      flow->yMin < blk->ySpaceT ||
+      flow->yMax > blk->ySpaceB) {
+    return gFalse;
   }
+
+  // check that block top edge is within +/- dy of flow top edge,
+  // and that block bottom edge is above flow bottom edge + dy
+  dy = flowMaxDeltaY * flow->blocks->maxFontSize;
+  return blk->yMin > flow->yMin - dy &&
+         blk->yMin < flow->yMin + dy &&
+         blk->yMax < flow->yMax + dy;
 }
 
-GBool TextPage::findText(char *s, GBool top, GBool bottom,
+
+GBool TextPage::findText(Unicode *s, int len,
+			 GBool top, GBool bottom,
 			 double *xMin, double *yMin,
 			 double *xMax, double *yMax) {
-  TextString *str;
-  char *p, *p1, *q;
-  int n, m, i;
-  double x;
+  TextLine *line;
+  Unicode *p;
+  Unicode u1, u2;
+  int m, i, j;
+  double x0, x1, x;
 
-  // scan all strings on page
-  n = strlen(s);
-  for (str = yxStrings; str; str = str->yxNext) {
+  // scan all text on the page
+  for (line = lines; line; line = line->pageNext) {
 
     // check: above top limit?
-    if (!top && (str->yMax < *yMin ||
-		 (str->yMin < *yMin && str->xMax <= *xMin)))
+    if (!top && (line->yMax < *yMin ||
+		 (line->yMin < *yMin && line->xMax <= *xMin))) {
       continue;
+    }
 
     // check: below bottom limit?
-    if (!bottom && (str->yMin > *yMax ||
-		    (str->yMax > *yMax && str->xMin >= *xMax)))
+    if (!bottom && (line->yMin > *yMax ||
+		    (line->yMax > *yMax && line->xMin >= *xMax))) {
       return gFalse;
+    }
+
+    // search each position in this line
+    m = line->len;
+    for (i = 0, p = line->text; i <= m - len; ++i, ++p) {
 
-    // search each position in this string
-    m = str->text->getLength();
-    for (i = 0, p = str->text->getCString(); i <= m - n; ++i, ++p) {
+      x0 = (i == 0) ? line->xMin : line->xRight[i-1];
+      x1 = line->xRight[i];
+      x = 0.5 * (x0 + x1);
 
       // check: above top limit?
-      if (!top && str->yMin < *yMin) {
-	x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
-	if (x < *xMin)
+      if (!top && line->yMin < *yMin) {
+	if (x < *xMin) {
 	  continue;
+	}
       }
 
       // check: below bottom limit?
-      if (!bottom && str->yMax > *yMax) {
-	x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
-	if (x > *xMax)
+      if (!bottom && line->yMax > *yMax) {
+	if (x > *xMax) {
 	  return gFalse;
+	}
       }
 
       // compare the strings
-      for (p1 = p, q = s; *q; ++p1, ++q) {
-	if (tolower(*p1) != tolower(*q))
+      for (j = 0; j < len; ++j) {
+#if 1 //~ this lowercases Latin A-Z only -- this will eventually be
+      //~ extended to handle other character sets
+	if (p[j] >= 0x41 && p[j] <= 0x5a) {
+	  u1 = p[j] + 0x20;
+	} else {
+	  u1 = p[j];
+	}
+	if (s[j] >= 0x41 && s[j] <= 0x5a) {
+	  u2 = s[j] + 0x20;
+	} else {
+	  u2 = s[j];
+	}
+#endif
+	if (u1 != u2) {
 	  break;
+	}
       }
 
       // found it
-      if (!*q) {
-	*xMin = (i == 0) ? str->xMin : str->xRight[i-1];
-	*xMax = str->xRight[i+n-1];
-	*yMin = str->yMin;
-	*yMax = str->yMax;
+      if (j == len) {
+	*xMin = x0;
+	*xMax = line->xRight[i + len - 1];
+	*yMin = line->yMin;
+	*yMax = line->yMax;
 	return gTrue;
       }
     }
   }
+
   return gFalse;
 }
 
 GString *TextPage::getText(double xMin, double yMin,
 			   double xMax, double yMax) {
   GString *s;
-  TextString *str1;
-  double x0, x1, x2, y;
-  double xPrev, yPrev;
-  int i1, i2;
+  UnicodeMap *uMap;
+  GBool isUnicode;
+  char space[8], eol[16], buf[8];
+  int spaceLen, eolLen, len;
+  TextLine *line, *prevLine;
+  double x0, x1, y;
+  int firstCol, col, i;
   GBool multiLine;
 
   s = new GString();
-  xPrev = yPrev = 0;
-  multiLine = gFalse;
-  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
-    y = 0.5 * (str1->yMin + str1->yMax);
-    if (y > yMax)
+
+  // get the output encoding
+  if (!(uMap = globalParams->getTextEncoding())) {
+    return s;
+  }
+  isUnicode = uMap->isUnicode();
+  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
+  eolLen = 0; // make gcc happy
+  switch (globalParams->getTextEOL()) {
+  case eolUnix:
+    eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
+    break;
+  case eolDOS:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
+    break;
+  case eolMac:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    break;
+  }
+
+  // find the leftmost column
+  firstCol = -1;
+  for (line = lines; line; line = line->pageNext) {
+    if (line->yMin > yMax) {
       break;
-    if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) {
-      x0 = x1 = x2 = str1->xMin;
-      for (i1 = 0; i1 < str1->text->getLength(); ++i1) {
-	x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1];
-	x1 = str1->xRight[i1];
-	if (0.5 * (x0 + x1) >= xMin)
-	  break;
-      }
-      for (i2 = str1->text->getLength() - 1; i2 > i1; --i2) {
-	x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1];
-	x2 = str1->xRight[i2];
-	if (0.5 * (x1 + x2) <= xMax)
-	  break;
-      }
-      if (s->getLength() > 0) {
-	if (x0 < xPrev || str1->yMin > yPrev) {
-	  s->append('\n');
-	  multiLine = gTrue;
-	} else {
-	  s->append("    ");
-	}
+    }
+    if (line->yMax < yMin || 
+	line->xMax < xMin ||
+	line->xMin > xMax) {
+      continue;
+    }
+
+    y = 0.5 * (line->yMin + line->yMax);
+    if (y < yMin || y > yMax) {
+      continue;
+    }
+
+    i = 0;
+    while (i < line->len) {
+      x0 = (i==0) ? line->xMin : line->xRight[i-1];
+      x1 = line->xRight[i];
+      if (0.5 * (x0 + x1) > xMin) {
+	break;
       }
-      s->append(str1->text->getCString() + i1, i2 - i1 + 1);
-      xPrev = x2;
-      yPrev = str1->yMax;
+      ++i;
+    }
+    if (i == line->len) {
+      continue;
+    }
+    col = line->col[i];
+
+    if (firstCol < 0 || col < firstCol) {
+      firstCol = col;
     }
   }
-  if (multiLine)
-    s->append('\n');
-  return s;
-}
 
-void TextPage::dump(FILE *f) {
-  TextString *str1, *str2, *str3;
-  double yMin, yMax;
-  int col1, col2;
-  double d;
-
-  // build x-major list
-  xyStrings = NULL;
-  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
-    for (str2 = NULL, str3 = xyStrings;
-	 str3;
-	 str2 = str3, str3 = str3->xyNext) {
-      if (str1->xMin < str3->xMin ||
-	  (str1->xMin == str3->xMin && str1->yMin < str3->yMin))
+  // extract the text
+  col = firstCol;
+  multiLine = gFalse;
+  prevLine = NULL;
+  for (line = lines; line; line = line->pageNext) {
+    if (line->yMin > yMax) {
+      break;
+    }
+    if (line->yMax < yMin || 
+	line->xMax < xMin ||
+	line->xMin > xMax) {
+      continue;
+    }
+
+    y = 0.5 * (line->yMin + line->yMax);
+    if (y < yMin || y > yMax) {
+      continue;
+    }
+
+    i = 0;
+    while (i < line->len) {
+      x0 = (i==0) ? line->xMin : line->xRight[i-1];
+      x1 = line->xRight[i];
+      if (0.5 * (x0 + x1) > xMin) {
 	break;
+      }
+      ++i;
+    }
+    if (i == line->len) {
+      continue;
     }
-    if (str2)
-      str2->xyNext = str1;
-    else
-      xyStrings = str1;
-    str1->xyNext = str3;
-  }
 
-  // do column assignment
-  for (str1 = xyStrings; str1; str1 = str1->xyNext) {
-    col1 = 0;
-    for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) {
-      if (str1->xMin >= str2->xMax) {
-	col2 = str2->col + str2->text->getLength() + 4;
-	if (col2 > col1)
-	  col1 = col2;
-      } else if (str1->xMin > str2->xMin) {
-	col2 = str2->col +
-	       (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) *
-		     str2->text->getLength());
-	if (col2 > col1) {
-	  col1 = col2;
-	}
+    // insert a return
+    if (line->col[i] < col ||
+	(prevLine &&
+	 line->yMin >
+	   prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) {
+      s->append(eol, eolLen);
+      col = firstCol;
+      multiLine = gTrue;
+    }
+    prevLine = line;
+
+    // line this block up with the correct column
+    for (; col < line->col[i]; ++col) {
+      s->append(space, spaceLen);
+    }
+
+    // print the portion of the line
+    for (; i < line->len; ++i) {
+
+      x0 = (i==0) ? line->xMin : line->xRight[i-1];
+      x1 = line->xRight[i];
+      if (0.5 * (x0 + x1) > xMax) {
+	break;
       }
+
+      len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+      s->append(buf, len);
+      col += isUnicode ? 1 : len;
     }
-    str1->col = col1;
   }
 
-#if 0 //~ for debugging
-  fprintf(f, "~~~~~~~~~~\n");
-  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
-    fprintf(f, "(%4d,%4d) - (%4d,%4d) [%3d] %s\n",
-	    (int)str1->xMin, (int)str1->yMin, (int)str1->xMax, (int)str1->yMax,
-	    str1->col, str1->text->getCString());
+  if (multiLine) {
+    s->append(eol, eolLen);
   }
-  fprintf(f, "~~~~~~~~~~\n");
-#endif
 
-  // output
-  col1 = 0;
-  yMax = yxStrings ? yxStrings->yMax : 0;
-  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
+  uMap->decRefCnt();
 
-    // line this string up with the correct column
-    if (rawOrder && col1 == 0) {
-      col1 = str1->col;
-    } else {
-      for (; col1 < str1->col; ++col1) {
-	fputc(' ', f);
+  return s;
+}
+
+GBool TextPage::findCharRange(int pos, int length,
+			      double *xMin, double *yMin,
+			      double *xMax, double *yMax) {
+  TextLine *line;
+  TextWord *word;
+  double x;
+  GBool first;
+  int i;
+
+  //~ this doesn't correctly handle:
+  //~ - ranges split across multiple lines (the highlighted region
+  //~   is the bounding box of all the parts of the range)
+  //~ - cases where characters don't convert one-to-one into Unicode
+  first = gTrue;
+  for (line = lines; line; line = line->pageNext) {
+    for (word = line->words; word; word = word->next) {
+      if (pos < word->charPos + word->charLen &&
+	  word->charPos < pos + length) {
+	i = pos - word->charPos;
+	if (i < 0) {
+	  i = 0;
+	}
+	x = (i == 0) ? word->xMin : word->xRight[i - 1];
+	if (first || x < *xMin) {
+	  *xMin = x;
+	}
+	i = pos + length - word->charPos;
+	if (i >= word->len) {
+	  i = word->len - 1;
+	}
+	x = word->xRight[i];
+	if (first || x > *xMax) {
+	  *xMax = x;
+	}
+	if (first || word->yMin < *yMin) {
+	  *yMin = word->yMin;
+	}
+	if (first || word->yMax > *yMax) {
+	  *yMax = word->yMax;
+	}
+	first = gFalse;
       }
     }
+  }
+  return !first;
+}
 
-    // print the string
-    fputs(str1->text->getCString(), f);
+void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
+		    GBool physLayout) {
+  UnicodeMap *uMap;
+  char space[8], eol[16], eop[8], buf[8];
+  int spaceLen, eolLen, eopLen, len;
+  TextFlow *flow;
+  TextLine *line;
+  int col, d, n, i;
+
+  // get the output encoding
+  if (!(uMap = globalParams->getTextEncoding())) {
+    return;
+  }
+  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
+  eolLen = 0; // make gcc happy
+  switch (globalParams->getTextEOL()) {
+  case eolUnix:
+    eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
+    break;
+  case eolDOS:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
+    break;
+  case eolMac:
+    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
+    break;
+  }
+  eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
 
-    // increment column
-    col1 += str1->text->getLength();
+  // output the page, maintaining the original physical layout
+  if (physLayout || rawOrder) {
+    col = 0;
+    for (line = lines; line; line = line->pageNext) {
 
-    // update yMax for this line
-    if (str1->yMax > yMax)
-      yMax = str1->yMax;
+      // line this block up with the correct column
+      if (!rawOrder) {
+	for (; col < line->col[0]; ++col) {
+	  (*outputFunc)(outputStream, space, spaceLen);
+	}
+      }
 
-    // if we've hit the end of the line...
-#if 0 //~
-    if (!(str1->yxNext &&
-	  !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
-	  str1->yxNext->yMin < str1->yMax &&
-	  str1->yxNext->xMin >= str1->xMax)) {
-#else
-    if (!(str1->yxNext &&
-	  !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
-	  str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax &&
-	  str1->yxNext->xMin >= str1->xMax)) {
-#endif
+      // print the line
+      for (i = 0; i < line->len; ++i) {
+	len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+	(*outputFunc)(outputStream, buf, len);
+      }
+      col += line->convertedLen;
+
+      // print one or more returns if necessary
+      if (rawOrder ||
+	  !line->pageNext ||
+	  line->pageNext->col[0] < col ||
+	  line->pageNext->yMin >
+	    line->yMax - lineOverlapSlack * line->fontSize) {
+
+	// compute number of returns
+	d = 1;
+	if (line->pageNext) {
+	  d += (int)((line->pageNext->yMin - line->yMax) /
+		     line->fontSize + 0.5);
+	}
 
-      // print a return
-      fputc('\n', f);
+	// various things (weird font matrices) can result in bogus
+	// values here, so do a sanity check
+	if (d < 1) {
+	  d = 1;
+	} else if (d > 5) {
+	  d = 5;
+	}
+	for (; d > 0; --d) {
+	  (*outputFunc)(outputStream, eol, eolLen);
+	}
 
-      // print extra vertical space if necessary
-      if (str1->yxNext) {
+	col = 0;
+      }
+    }
 
-	// find yMin for next line
-	yMin = str1->yxNext->yMin;
-	for (str2 = str1->yxNext; str2; str2 = str2->yxNext) {
-	  if (str2->yMin < yMin)
-	    yMin = str2->yMin;
-	  if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax &&
-		str2->yxNext->xMin >= str2->xMax))
-	    break;
+  // output the page, "undoing" the layout
+  } else {
+    for (flow = flows; flow; flow = flow->next) {
+      for (line = flow->lines; line; line = line->flowNext) {
+	n = line->len;
+	if (line->flowNext && line->hyphenated) {
+	  --n;
 	}
-	  
-	// print the space
-	d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5);
-	if (rawOrder && d > 2) {
-	  d = 2;
+	for (i = 0; i < n; ++i) {
+	  len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+	  (*outputFunc)(outputStream, buf, len);
 	}
-	for (; d > 0; --d) {
-	  fputc('\n', f);
+	if (line->flowNext && !line->hyphenated) {
+	  (*outputFunc)(outputStream, space, spaceLen);
 	}
       }
-
-      // set up for next line
-      col1 = 0;
-      yMax = str1->yxNext ? str1->yxNext->yMax : 0;
+      (*outputFunc)(outputStream, eol, eolLen);
+      (*outputFunc)(outputStream, eol, eolLen);
     }
   }
+
+  // end of page
+  (*outputFunc)(outputStream, eop, eopLen);
+  (*outputFunc)(outputStream, eol, eolLen);
+
+  uMap->decRefCnt();
+}
+
+void TextPage::startPage(GfxState *state) {
+  clear();
+  if (state) {
+    pageWidth = state->getPageWidth();
+    pageHeight = state->getPageHeight();
+  } else {
+    pageWidth = pageHeight = 0;
+  }
 }
 
 void TextPage::clear() {
-  TextString *p1, *p2;
+  TextWord *w1, *w2;
+  TextFlow *f1, *f2;
 
-  if (curStr) {
-    delete curStr;
-    curStr = NULL;
+  if (curWord) {
+    delete curWord;
+    curWord = NULL;
   }
-  for (p1 = yxStrings; p1; p1 = p2) {
-    p2 = p1->yxNext;
-    delete p1;
+  if (words) {
+    for (w1 = words; w1; w1 = w2) {
+      w2 = w1->next;
+      delete w1;
+    }
+  } else if (flows) {
+    for (f1 = flows; f1; f1 = f2) {
+      f2 = f1->next;
+      delete f1;
+    }
   }
-  yxStrings = NULL;
-  xyStrings = NULL;
-  yxCur1 = yxCur2 = NULL;
+  deleteGList(fonts, TextFontInfo);
+
+  curWord = NULL;
+  charPos = 0;
+  font = NULL;
+  fontSize = 0;
+  nest = 0;
+  nTinyChars = 0;
+  words = wordPtr = NULL;
+  lines = NULL;
+  flows = NULL;
+  fonts = new GList();
+
 }
 
+
 //------------------------------------------------------------------------
 // TextOutputDev
 //------------------------------------------------------------------------
 
-TextOutputDev::TextOutputDev(char *fileName, GBool useASCII7, GBool rawOrder) {
+static void outputToFile(void *stream, char *text, int len) {
+  fwrite(text, 1, len, (FILE *)stream);
+}
+
+TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
+			     GBool rawOrderA, GBool append) {
   text = NULL;
-  this->rawOrder = rawOrder;
+  physLayout = physLayoutA;
+  rawOrder = rawOrderA;
   ok = gTrue;
 
   // open file
   needClose = gFalse;
   if (fileName) {
     if (!strcmp(fileName, "-")) {
-      f = stdout;
-    } else if ((f = fopen(fileName, "w"))) {
+      outputStream = stdout;
+#ifdef WIN32
+      // keep DOS from munging the end-of-line characters
+      setmode(fileno(stdout), O_BINARY);
+#endif
+    } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
       needClose = gTrue;
     } else {
       error(-1, "Couldn't open text file '%s'", fileName);
       ok = gFalse;
       return;
     }
+    outputFunc = &outputToFile;
   } else {
-    f = NULL;
+    outputStream = NULL;
   }
 
   // set up text object
-  text = new TextPage(useASCII7, rawOrder);
+  text = new TextPage(rawOrderA);
+}
+
+TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
+			     GBool physLayoutA, GBool rawOrderA) {
+  outputFunc = func;
+  outputStream = stream;
+  needClose = gFalse;
+  physLayout = physLayoutA;
+  rawOrder = rawOrderA;
+  text = new TextPage(rawOrderA);
+  ok = gTrue;
 }
 
 TextOutputDev::~TextOutputDev() {
-  if (needClose)
-    fclose(f);
-  if (text)
+  if (needClose) {
+#ifdef MACOS
+    ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
+#endif
+    fclose((FILE *)outputStream);
+  }
+  if (text) {
     delete text;
+  }
 }
 
 void TextOutputDev::startPage(int pageNum, GfxState *state) {
-  text->clear();
+  text->startPage(state);
 }
 
 void TextOutputDev::endPage() {
-  text->coalesce();
-  if (f) {
-    text->dump(f);
-    fputc('\n', f);
-    fputs("\f\n", f);
-    fputc('\n', f);
+  text->coalesce(physLayout);
+  if (outputStream) {
+    text->dump(outputStream, outputFunc, physLayout);
   }
 }
 
 void TextOutputDev::updateFont(GfxState *state) {
-  GfxFont *font;
-  char *charName;
-  int c;
-
-  // look for hex char codes in subsetted font
-  hexCodes = gFalse;
-  if ((font = state->getFont()) && !font->is16Bit()) {
-    for (c = 0; c < 256; ++c) {
-      if ((charName = font->getCharName(c))) {
-	if ((charName[0] == 'B' || charName[0] == 'C' ||
-	     charName[0] == 'G') &&
-	    strlen(charName) == 3 &&
-	    ((charName[1] >= 'a' && charName[1] <= 'f') ||
-	     (charName[1] >= 'A' && charName[1] <= 'F') ||
-	     (charName[2] >= 'a' && charName[2] <= 'f') ||
-	     (charName[2] >= 'A' && charName[2] <= 'F'))) {
-	  hexCodes = gTrue;
-	  break;
-	}
-      }
-    }
-  }
+  text->updateFont(state);
 }
 
 void TextOutputDev::beginString(GfxState *state, GString *s) {
-  text->beginString(state, s, hexCodes);
+  text->beginWord(state, state->getCurX(), state->getCurY());
 }
 
 void TextOutputDev::endString(GfxState *state) {
-  text->endString();
+  text->endWord();
 }
 
 void TextOutputDev::drawChar(GfxState *state, double x, double y,
-			     double dx, double dy, Guchar c) {
-  text->addChar(state, x, y, dx, dy, c);
+			     double dx, double dy,
+			     double originX, double originY,
+			     CharCode c, Unicode *u, int uLen) {
+  text->addChar(state, x, y, dx, dy, c, u, uLen);
 }
 
-void TextOutputDev::drawChar16(GfxState *state, double x, double y,
-			       double dx, double dy, int c) {
-  text->addChar16(state, x, y, dx, dy, c, state->getFont()->getCharSet16());
-}
-
-GBool TextOutputDev::findText(char *s, GBool top, GBool bottom,
+GBool TextOutputDev::findText(Unicode *s, int len,
+			      GBool top, GBool bottom,
 			      double *xMin, double *yMin,
 			      double *xMax, double *yMax) {
-  return text->findText(s, top, bottom, xMin, yMin, xMax, yMax);
+  return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
+}
+
+GString *TextOutputDev::getText(double xMin, double yMin,
+				double xMax, double yMax) {
+  return text->getText(xMin, yMin, xMax, yMax);
 }
+
+GBool TextOutputDev::findCharRange(int pos, int length,
+				   double *xMin, double *yMin,
+				   double *xMax, double *yMax) {
+  return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
+}
+
+