PDFTextStripper2.java example

Explorer
drc-master
- de.uni_koeln.ub.drc
  - src
    - java
      - de
        uni_koeln
        ub
        drc
        reader
        ExtractedWord.java
        PageInfo.java
        Paragraph.java
        PdfContentExtractor.java
        Point.java
        temp
        PDFTextStripper2.java
        PositionWrapper.java
    - specs
      - de
        uni_koeln
        ub
        drc
        SpecSuite.java
    - tests
      - de
        uni_koeln
        ub
        drc
        reader
        PdfExtractionTests.java
- de.uni_koeln.ub.drc.ui
  - src
    - de
      - uni_koeln
        ub
        drc
        ui
        ApplicationActionBarAdvisor.java
        ApplicationWorkbenchWindowAdvisor.java
        DrcUiActivator.java
        Messages.java
        Perspective.java
        facades
        CSSSWTConstantsHelper.java
        IDialogConstantsHelper.java
        ImplementationLoader.java
        NLSHelper.java
        ScrolledCompositeHelper.java
        TextHelper.java
        handlers
        ExitHandler.java
        SaveHandler.java
        login
        SimpleLoginDialog.java
        SimpleLoginModule.java
        views
        CheckView.java
        CommentsView.java
        EditComposite.java
        EditView.java
        SearchView.java
        SpecialCharacterView.java
        TableHelper.java
        TagView.java
        UserView.java
        WordView.java
        WordViewModel.java
- de.uni_koeln.ub.drc.ui.rap
  - src
    - de
      - uni_koeln
        ub
        drc
        ui
        facades
        CSSSWTConstantsHelperImpl.java
        IDialogConstantsHelperImpl.java
        NLSHelperImpl.java
        ScrolledCompositeHelperImpl.java
        TextHelperImpl.java
        rap
        ApplicationWorkbenchAdvisor.java
        EntryPoint.java
- de.uni_koeln.ub.drc.ui.rcp
  - src
    - de
      - uni_koeln
        ub
        drc
        ui
        facades
        CSSSWTConstantsHelperImpl.java
        IDialogConstantsHelperImpl.java
        NLSHelperImpl.java
        ScrolledCompositeHelperImpl.java
        TextHelperImpl.java
        rcp
        Application.java
        ApplicationWorkbenchAdvisor.java
  - src-tests
    - de
      - uni_koeln
        ub
        drc
        ui
        AllTestsSuite.java
        SearchEditorTest.java
        ViewsEditorTest.java
- drcp
  - test
    - ApplicationTest.java
    - BasicTest.java
/**
 *
 */
package de.uni_koeln.ub.drc.reader.temp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextNormalize;
import org.apache.pdfbox.util.TextPosition;
import org.apache.pdfbox.util.TextPositionComparator;

/**
 * subclass of {@link PDFTextStripper PDFTextStripper} that attempts to detect
 * white-space between paragraphs. This class inserts an extra newline when it
 * finds such. This also inserts an extra newline before each page start.
 * Subclasses can override that behavior by overriding the
 * {@link #getLineSeparator()}, {@link #getParagraphStart()},
 * {@link #getParagraphEnd()} and {@link #getPageSeparator()} methods.
 * <p>
 * The values used for paragraph drop and indent detection can be set
 * programmatically using {@link #setDropThreshold(float)} and
 * {@link #setIndentThreshold(float)} or the defaults can be changed using the
 * System Properties:
 * 
 * <pre>
 *     pdftextstripper2.drop
 *     pdftextstripper2.indent
 * </pre>
 * 
 * which can be set using the -D switch at the start of the Java runtime.
 * </p>
 * 
 * @author m.martinez@ll.mit.edu
 * 
 */
/* Third-party code */@SuppressWarnings("all")
public class PDFTextStripper2 extends PDFTextStripper {

	/** This class; its simple name is the prefix of the system property names. */
	private static final Class<?> thisClass = PDFTextStripper2.class;

	/** Default indent threshold; 2.0 unless overridden by system property. */
	private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
	/** Default drop threshold; 2.5 unless overridden by system property. */
	private static float DEFAULT_DROP_THRESHOLD = 2.5f;

	// enable the ability to set the default indent/drop thresholds
	// with -D system properties:
	// pdftextstripper2.indent
	// pdftextstripper2.drop
	static {
		// Locale.ROOT keeps the property prefix stable. With the default-locale
		// toLowerCase(), a Turkish locale maps 'I' to dotless 'ı' and the
		// documented property names would never be looked up.
		String prefix = thisClass.getSimpleName().toLowerCase(
				java.util.Locale.ROOT);
		DEFAULT_INDENT_THRESHOLD = readFloatProperty(prefix + ".indent",
				DEFAULT_INDENT_THRESHOLD);
		DEFAULT_DROP_THRESHOLD = readFloatProperty(prefix + ".drop",
				DEFAULT_DROP_THRESHOLD);
	}

	/**
	 * Reads a float-valued system property.
	 * 
	 * @param name
	 *            the system property name.
	 * @param fallback
	 *            the value to return when the property is unset, empty or not
	 *            parseable as a float.
	 * @return the parsed property value, or {@code fallback}.
	 */
	private static float readFloatProperty(String name, float fallback) {
		String raw = System.getProperty(name);
		if (raw == null || raw.length() == 0) {
			return fallback;
		}
		try {
			return Float.parseFloat(raw);
		} catch (NumberFormatException ignored) {
			// malformed value: deliberately keep the built-in default
			return fallback;
		}
	}

	// Delimiter strings written to the output around paragraphs, pages and
	// articles. All but paragraphStart default to lineSeparator, which is not
	// declared in this file (presumably inherited from PDFTextStripper).
	private String paragraphStart = "";
	private String paragraphEnd = lineSeparator;
	private String pageStart = lineSeparator;
	private String pageEnd = lineSeparator;
	private String articleStart = lineSeparator;
	private String articleEnd = lineSeparator;

	// Per-instance paragraph detection thresholds, seeded from the class-wide
	// defaults (which may have been overridden through system properties).
	private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
	private float dropThreshold = DEFAULT_DROP_THRESHOLD;

	/**
	 * Returns the indent threshold used for paragraph detection: the multiple
	 * of the space-character width by which a line start must be indented
	 * relative to the previous line start before it is treated as a paragraph
	 * start.
	 * 
	 * @return the indent threshold, in space-character widths.
	 */
	public float getIndentThreshold() {
		return this.indentThreshold;
	}

	/**
	 * Sets the indent threshold used for paragraph detection: the multiple of
	 * the space-character width by which a line start must be indented
	 * relative to the previous line start before it is treated as a paragraph
	 * start. The built-in default is 2.0. The value is stored as given; no
	 * range check is applied.
	 * 
	 * @param indentThreshold
	 *            the number of whitespace character widths to use when
	 *            detecting paragraph indents.
	 */
	public void setIndentThreshold(float indentThreshold) {
		this.indentThreshold = indentThreshold;
	}

	/**
	 * Returns the drop threshold used for paragraph detection: the multiple of
	 * the current text height that the vertical gap between two lines must
	 * exceed before the new line is treated as a paragraph start.
	 * 
	 * @return the drop threshold, in multiples of the text height.
	 */
	public float getDropThreshold() {
		return this.dropThreshold;
	}

	/**
	 * Sets the drop threshold used for paragraph detection: the multiple of
	 * the current text height that the vertical gap between two lines must
	 * exceed before the new line is treated as a paragraph start. The built-in
	 * default is 2.5. The value is stored as given; no range check is applied.
	 * 
	 * @param dropThreshold
	 *            the character height multiple for max allowed whitespace
	 *            between lines in the same paragraph.
	 */
	public void setDropThreshold(float dropThreshold) {
		this.dropThreshold = dropThreshold;
	}

	/**
	 * @return the string written at the start of each paragraph (empty by
	 *         default).
	 */
	public String getParagraphStart() {
		return this.paragraphStart;
	}

	/**
	 * Sets the string written at the start of each paragraph.
	 * 
	 * @param s
	 *            the paragraph start string; stored as given.
	 */
	public void setParagraphStart(String s) {
		this.paragraphStart = s;
	}

	/**
	 * @return the string written at the end of each paragraph.
	 */
	public String getParagraphEnd() {
		return this.paragraphEnd;
	}

	/**
	 * Sets the string written at the end of each paragraph.
	 * 
	 * @param s
	 *            the paragraph end string; stored as given.
	 */
	public void setParagraphEnd(String s) {
		this.paragraphEnd = s;
	}

	/**
	 * @return the string written at the start of each page.
	 */
	public String getPageStart() {
		return this.pageStart;
	}

	/**
	 * Sets the string written at the start of each page.
	 * 
	 * @param pageStart
	 *            the page start string; stored as given.
	 */
	public void setPageStart(String pageStart) {
		this.pageStart = pageStart;
	}

	/**
	 * @return the string written at the end of each page.
	 */
	public String getPageEnd() {
		return this.pageEnd;
	}

	/**
	 * Sets the string written at the end of each page.
	 * 
	 * @param pageEnd
	 *            the page end string; stored as given.
	 */
	public void setPageEnd(String pageEnd) {
		this.pageEnd = pageEnd;
	}

	/**
	 * Returns the page end string immediately followed by the page start
	 * string, i.e. the text that lies between two consecutive pages.
	 * 
	 * @deprecated - not used in PDFTextStripper2. Use discrete
	 *             {@link #getPageStart()} and {@link #getPageEnd()} instead.
	 * @return the concatenation of {@link #getPageEnd()} and
	 *         {@link #getPageStart()}.
	 */
	public String getPageSeparator() {
		String endOfPreviousPage = getPageEnd();
		String startOfNextPage = getPageStart();
		return endOfPreviousPage + startOfNextPage;
	}

	/**
	 * Does nothing: the body is intentionally empty and the supplied separator
	 * is discarded.
	 * 
	 * @deprecated - not used in PDFTextStripper2. Use discrete
	 *             {@link #setPageStart(String)} and {@link #setPageEnd(String)}
	 *             instead.
	 * @param separator
	 *            The desired page separator string (ignored).
	 */
	public void setPageSeparator(String separator) {
	}

	/**
	 * @return the string written at the start of each article.
	 */
	public String getArticleStart() {
		return this.articleStart;
	}

	/**
	 * Sets the string written at the start of each article.
	 * 
	 * @param articleStart
	 *            the article start string; stored as given.
	 */
	public void setArticleStart(String articleStart) {
		this.articleStart = articleStart;
	}

	/**
	 * @return the string written at the end of each article.
	 */
	public String getArticleEnd() {
		return this.articleEnd;
	}

	/**
	 * Sets the string written at the end of each article.
	 * 
	 * @param articleEnd
	 *            the article end string; stored as given.
	 */
	public void setArticleEnd(String articleEnd) {
		this.articleEnd = articleEnd;
	}

	/**
	 * Writes the article start string to the output.
	 * 
	 * @param isltr
	 *            not used by this implementation.
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void startArticle(boolean isltr) throws IOException {
		String articleOpening = getArticleStart();
		output.write(articleOpening);
	}

	/**
	 * Writes the article end string to the output.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void endArticle() throws IOException {
		String articleClosing = getArticleEnd();
		output.write(articleClosing);
	}

	/**
	 * Creates a stripper by delegating to the no-argument superclass
	 * constructor.
	 * 
	 * @throws IOException
	 *             if the superclass constructor fails.
	 */
	public PDFTextStripper2() throws IOException {
		super();
	}

	/**
	 * Creates a stripper by passing {@code s} straight to the corresponding
	 * superclass constructor.
	 * 
	 * @param s
	 *            forwarded unchanged to the superclass; presumably the output
	 *            encoding name - confirm against PDFTextStripper.
	 * @throws IOException
	 *             if the superclass constructor fails.
	 */
	public PDFTextStripper2(String s) throws IOException {
		super(s);
	}

	/**
	 * Creates a stripper by passing {@code properties} straight to the
	 * corresponding superclass constructor.
	 * 
	 * @param properties
	 *            forwarded unchanged to the superclass; see PDFTextStripper
	 *            for the keys it understands.
	 * @throws IOException
	 *             if the superclass constructor fails.
	 */
	public PDFTextStripper2(Properties properties) throws IOException {
		super(properties);
	}

	/**
	 * Handles the transition to a new line. Marks {@code current} as a line
	 * start, lets
	 * {@link #isParagraphSeparation(PositionWrapper, PositionWrapper, PositionWrapper)}
	 * decide whether it also starts a paragraph, and writes the matching
	 * separators:
	 * <ul>
	 * <li>not a paragraph start: a line separator only;</li>
	 * <li>paragraph start directly after an article start: the paragraph start
	 * string only;</li>
	 * <li>any other paragraph start: a line separator followed by the
	 * paragraph separator.</li>
	 * </ul>
	 * 
	 * @param current
	 *            the current text position; becomes the new line start.
	 * @param lastPosition
	 *            the previous text position (must not be null).
	 * @param lastLineStartPosition
	 *            the last text position that followed a line separator.
	 * @return the new line start position, i.e. {@code current}.
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected PositionWrapper handleLineSeparation(PositionWrapper current,
			PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
			throws IOException {
		current.setLineStart();
		isParagraphSeparation(current, lastPosition, lastLineStartPosition);

		if (!current.isParagraphStart()) {
			// ordinary line break inside a paragraph
			writeLineSeparator();
		} else if (lastPosition.isArticleStart()) {
			// first paragraph of an article: nothing to close yet
			writeParagraphStart();
		} else {
			writeLineSeparator();
			writeParagraphSeparator();
		}
		return current;
	}

	/**
	 * Decides whether the gap between the previous and the current text
	 * position is a paragraph separation and records the outcome on
	 * {@code position}. Should <i>only</i> be called for consecutive text
	 * positions that already passed the line separation test.
	 * <p>
	 * The current position is flagged as a paragraph start when one of the
	 * following holds:
	 * </p>
	 * <ul>
	 * <li>{@code lastLineStartPosition} is null;</li>
	 * <li>the vertical distance to {@code lastPosition} exceeds
	 * {@link #getDropThreshold()} times the current text height;</li>
	 * <li>the line is indented from the last line start by more than
	 * {@link #getIndentThreshold()} space widths and the last line start was
	 * not itself a paragraph start (otherwise the current position is flagged
	 * as a hanging indent instead);</li>
	 * <li>the line starts more than one space width to the left of the last
	 * line start and the last line start was not a paragraph start;</li>
	 * <li>the line starts within a quarter of the current text width of the
	 * last line start, the last line start was a paragraph start, and both
	 * texts match the same list item pattern.</li>
	 * </ul>
	 * <p>
	 * When the two line starts are lined up and the last one was a hanging
	 * indent, the hanging indent flag is carried over to the current position.
	 * </p>
	 * 
	 * @param position
	 *            the current text position. This may have its isParagraphStart
	 *            or isHangingIndent flags set upon return.
	 * @param lastPosition
	 *            the previous text position (should not be null).
	 * @param lastLineStartPosition
	 *            the last text position that followed a line separator. May be
	 *            null.
	 */
	protected void isParagraphSeparation(PositionWrapper position,
			PositionWrapper lastPosition, PositionWrapper lastLineStartPosition) {
		if (lastLineStartPosition == null) {
			// nothing to compare against: treat as the first paragraph
			position.setParagraphStart();
			return;
		}

		TextPosition currentText = position.getTextPosition();
		float yGap = Math.abs(currentText.getYDirAdj()
				- lastPosition.getTextPosition().getYDirAdj());
		// do we need to flip this for rtl?
		float xGap = currentText.getXDirAdj()
				- lastLineStartPosition.getTextPosition().getXDirAdj();

		boolean startsParagraph = false;
		if (yGap > (getDropThreshold() * currentText.getHeightDir())) {
			startsParagraph = true;
		} else if (xGap > (getIndentThreshold() * currentText
				.getWidthOfSpace())) {
			// text is indented, but try to screen for hanging indent
			if (lastLineStartPosition.isParagraphStart()) {
				position.setHangingIndent();
			} else {
				startsParagraph = true;
			}
		} else if (xGap < -currentText.getWidthOfSpace()) {
			// text is left of previous line. Was it a hanging indent?
			startsParagraph = !lastLineStartPosition.isParagraphStart();
		} else if (Math.abs(xGap) < (0.25 * currentText.getWidth())) {
			// within 1/4 of a char of the last line start: treat the two
			// lines as lined up
			if (lastLineStartPosition.isHangingIndent()) {
				position.setHangingIndent();
			} else if (lastLineStartPosition.isParagraphStart()) {
				// consecutive lines matching the same list item format are
				// separate paragraphs
				Pattern previousItemPattern = matchListItemPattern(lastLineStartPosition);
				if (previousItemPattern != null
						&& previousItemPattern == matchListItemPattern(position)) {
					startsParagraph = true;
				}
			}
		}

		if (startsParagraph) {
			position.setParagraphStart();
		}
	}

	/**
	 * Looks up the list item Pattern that matches the text of the given
	 * position. The candidates come from {@link #getListItemPatterns()}; to
	 * extend them, override that method or supply your own list through
	 * {@link #setListItemPatterns(List)}.
	 * 
	 * @param pw
	 *            the position whose text is tested.
	 * @return the first matching Pattern, or null if none matches.
	 */
	protected Pattern matchListItemPattern(PositionWrapper pw) {
		return matchPattern(pw.getTextPosition().getCharacter(),
				getListItemPatterns());
	}

	/**
	 * Iterates over the specified list of Patterns and returns the first one
	 * that matches the <i>entire</i> specified string.
	 * <p>
	 * Order of the supplied list of patterns is important as most common
	 * patterns should come first. Patterns should be strict in general, and all
	 * will be used with case sensitivity on.
	 * </p>
	 * 
	 * @param s
	 *            the text to test; may be null, in which case nothing matches.
	 * @param patterns
	 *            the candidate patterns, in priority order; may be null.
	 * @return the first Pattern that fully matches {@code s}, or null if
	 *         there is none.
	 */
	protected static final Pattern matchPattern(String s, List<Pattern> patterns) {
		// A text position can carry a null character string; treat that as
		// "no match" rather than failing inside Pattern.matcher(null).
		if (s == null || patterns == null) {
			return null;
		}
		for (Pattern p : patterns) {
			if (p.matcher(s).matches()) {
				return p;
			}
		}
		return null;
	}

	// List item patterns in use; null until first requested through
	// getListItemPatterns() or supplied through setListItemPatterns(List).
	private List<Pattern> liPatterns = null;

	/**
	 * Regular expressions that match commonly used list item markers: a single
	 * period, digits followed by a period, digits in square brackets, digits
	 * followed by a closing parenthesis, a single upper- or lower-case letter
	 * followed by a period or closing parenthesis, and Roman numerals (built
	 * from I, V, X, L in one case) followed by a period. Not meant to be
	 * comprehensive.
	 */
	public static final String[] LIST_ITEM_EXPRESSIONS = { "\\.", "\\d+\\.",
			"\\[\\d+\\]", "\\d+\\)", "[A-Z]\\.", "[a-z]\\.", "[A-Z]\\)",
			"[a-z]\\)", "[IVXL]+\\.", "[ivxl]+\\.",

	};

	/**
	 * Returns the regular expression Patterns used to recognise list item
	 * markers such as "1.", "[2]", "a)" or "iv.".
	 * <p>
	 * Unless a list was supplied through {@link #setListItemPatterns(List)},
	 * the list is built on first use by compiling every entry of
	 * {@link #LIST_ITEM_EXPRESSIONS}, in order, and is then reused by later
	 * calls.
	 * </p>
	 * 
	 * @return a list of Pattern objects.
	 */
	protected List<Pattern> getListItemPatterns() {
		if (liPatterns != null) {
			return liPatterns;
		}
		List<Pattern> compiled = new ArrayList<Pattern>();
		liPatterns = compiled;
		for (int i = 0; i < LIST_ITEM_EXPRESSIONS.length; i++) {
			compiled.add(Pattern.compile(LIST_ITEM_EXPRESSIONS[i]));
		}
		return liPatterns;
	}

	/**
	 * Supplies a different set of regular expression patterns for matching
	 * list item starts. The list is stored by reference, not copied. Passing
	 * null makes the next {@link #getListItemPatterns()} call rebuild the
	 * list from {@link #LIST_ITEM_EXPRESSIONS}.
	 * 
	 * @param patterns
	 *            the patterns to use, in priority order.
	 */
	protected void setListItemPatterns(List<Pattern> patterns) {
		liPatterns = patterns;
	}

	/**
	 * Writes a paragraph boundary to the output: the paragraph end string
	 * followed by the paragraph start string.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writeParagraphSeparator() throws IOException {
		// close the current paragraph ...
		this.writeParagraphEnd();
		// ... and open the next one
		this.writeParagraphStart();
	}

	/**
	 * Writes the paragraph start string to the output.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writeParagraphStart() throws IOException {
		String marker = getParagraphStart();
		output.write(marker);
	}

	/**
	 * Writes the paragraph end string to the output.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writeParagraphEnd() throws IOException {
		String marker = getParagraphEnd();
		output.write(marker);
	}

	/**
	 * Writes the page start string to the output.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writePageStart() throws IOException {
		String marker = getPageStart();
		output.write(marker);
	}

	/**
	 * Writes the page end string to the output.
	 * 
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writePageEnd() throws IOException {
		String marker = getPageEnd();
		output.write(marker);
	}

	/**
	 * The normalizer is used to remove text ligatures/presentation forms and to
	 * correct the direction of right to left text, such as Arabic and Hebrew.
	 * It is created for this stripper's output encoding.
	 * <p>
	 * NOTE - this field duplicates the functionality of the private field by
	 * the same name in the parent class. Could be eliminated with a couple of
	 * minor mods of the parent.
	 * </p>
	 * <p>
	 * NOTE(review): no live code in this file reads this field; line
	 * normalization goes through
	 * {@link NormalizedTextPosition#getNormalize(String)} instead.
	 * </p>
	 */
	private TextNormalize normalize = new TextNormalize(this.outputEncoding);

	/**
	 * Tells whether two vertical spans overlap. Each span is given by a y
	 * position and a height and covers the range from {@code y - height} to
	 * {@code y}. Spans whose y positions differ by less than 0.1 always count
	 * as overlapping.
	 * <p>
	 * NOTE - this duplicates functionality of a private method by the same name
	 * in the parent class.
	 * </p>
	 * 
	 * @param y1
	 *            y position of the first span.
	 * @param height1
	 *            height of the first span.
	 * @param y2
	 *            y position of the second span.
	 * @param height2
	 *            height of the second span.
	 * @return true if the spans overlap, false otherwise.
	 */
	protected final boolean overlap(float y1, float height1, float y2,
			float height2) {
		if (within(y1, y2, .1f)) {
			return true;
		}
		// y2 lies inside the first span
		if (y2 <= y1 && y2 >= y1 - height1) {
			return true;
		}
		// y1 lies inside the second span
		return y1 <= y2 && y1 >= y2 - height2;
	}

	/**
	 * Determines whether two floating point numbers lie within a specified
	 * variance of each other. Both bounds are exclusive, so a variance of zero
	 * never matches.
	 * <p>
	 * NOTE - this duplicates functionality of a private method by the same name
	 * in the parent class.
	 * </p>
	 * 
	 * @param first
	 *            The first number to compare to.
	 * @param second
	 *            The second number to compare to.
	 * @param variance
	 *            The allowed variance.
	 * @return true if {@code second} is strictly between
	 *         {@code first - variance} and {@code first + variance}.
	 */
	protected boolean within(float first, float second, float variance) {
		float lowerBound = first - variance;
		if (!(second > lowerBound)) {
			return false;
		}
		float upperBound = first + variance;
		return second < upperBound;
	}

	// Sentinel values that writePage() assigns to its per-line tracking
	// variables, initially and whenever a new line starts. They mean "no value
	// recorded yet for this line".
	private static final float ENDOFLASTTEXTX_RESET_VALUE = -1;
	private static final float MAXYFORLINE_RESET_VALUE = -Float.MAX_VALUE;
	private static final float EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE = -Float.MAX_VALUE;
	private static final float MAXHEIGHTFORLINE_RESET_VALUE = -1;
	private static final float MINYTOPFORLINE_RESET_VALUE = Float.MAX_VALUE;
	private static final float LASTWORDSPACING_RESET_VALUE = -1;

	/**
	 * This will print the text of the processed page to "output". It will
	 * estimate, based on the coordinates of the text, where newlines and word
	 * spacings should be placed. The text will be sorted only if that feature
	 * was enabled.
	 * <p>
	 * For every article the method first counts left-to-right and
	 * right-to-left characters to pick the dominant direction, then walks the
	 * text positions, queueing one line at a time. When a position no longer
	 * vertically overlaps the current line, the queued line is normalized and
	 * written and
	 * {@link #handleLineSeparation(PositionWrapper, PositionWrapper, PositionWrapper)}
	 * emits the line/paragraph separators. A word separator is queued when a
	 * position starts beyond the expected start of the next word.
	 * </p>
	 * <p>
	 * The page start string is only written when the page has at least one
	 * article; the page end string is always written.
	 * </p>
	 * <p>
	 * NOTE - this overrides the parent class'
	 * {@link PDFTextStripper#writePage() writePage} method. It unfortunately
	 * copies in much of the parent code, with only minor mods to account for
	 * visibility. Functionally, the main difference is the replacement of the
	 * plain line separator with the paragraph-aware handling above, as well as
	 * tracking of some state information during the parse.
	 * </p>
	 * 
	 * @throws IOException
	 *             If there is an error writing the text.
	 */
	protected void writePage() throws IOException {
		float maxYForLine = MAXYFORLINE_RESET_VALUE;
		// maintained per line but not consulted by any decision below
		float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
		float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
		float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
		float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
		PositionWrapper lastPosition = null;
		PositionWrapper lastLineStartPosition = null;

		boolean startOfPage = true;// flag to indicate start of page
		boolean startOfArticle = true;
		if (charactersByArticle.size() > 0) {
			writePageStart();
		}

		for (int i = 0; i < charactersByArticle.size(); i++) {
			List textList = (List) charactersByArticle.get(i);
			if (shouldSortByPosition()) {
				TextPositionComparator comparator = new TextPositionComparator();
				Collections.sort(textList, comparator);
			}

			Iterator textIter = textList.iterator();

			/*
			 * Before we can display the text, we need to do some normalizing.
			 * Arabic and Hebrew text is right to left and is typically stored
			 * in its logical format, which means that the rightmost character
			 * is stored first, followed by the second character from the right
			 * etc. However, PDF stores the text in presentation form, which is
			 * left to right. We need to do some normalization to convert the
			 * PDF data to the proper logical output format.
			 * 
			 * Note that if we did not sort the text, then the output of
			 * reversing the text is undefined and can sometimes produce worse
			 * output then not trying to reverse the order. Sorting should be
			 * done for these languages.
			 */

			/*
			 * First step is to determine if we have any right to left text, and
			 * if so, is it dominant.
			 */
			int ltrCnt = 0;
			int rtlCnt = 0;

			while (textIter.hasNext()) {
				TextPosition position = (TextPosition) textIter.next();
				String stringValue = position.getCharacter();
				for (int a = 0; a < stringValue.length(); a++) {
					byte dir = Character.getDirectionality(stringValue
							.charAt(a));
					if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT)
							|| (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING)
							|| (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
						ltrCnt++;
					} else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
							|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
							|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
							|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
						rtlCnt++;
					}
				}
			}

			// choose the dominant direction
			boolean isRtlDominant = rtlCnt > ltrCnt;

			startArticle(!isRtlDominant);
			startOfArticle = true;

			/*
			 * Now cycle through to print the text. We queue up a line at a time
			 * before we print so that we can convert the line from presentation
			 * form to logical form (if needed).
			 */
			List<TextPosition> line = new ArrayList<TextPosition>();

			textIter = textList.iterator(); // start from the beginning again
			/*
			 * PDF files don't always store spaces. We will need to guess where
			 * we should add spaces based on the distances between
			 * TextPositions. Historically, this was done based on the size of
			 * the space character provided by the font. In general, this worked
			 * but there were cases where it did not work. Calculating the
			 * average character width and using that as a metric works better
			 * in some cases but fails in some cases where the spacing worked.
			 * So we use both. NOTE: Adobe reader also fails on some of these
			 * examples.
			 */
			// Keeps track of the previous average character width
			float previousAveCharWidth = -1;
			while (textIter.hasNext()) {
				TextPosition position = (TextPosition) textIter.next();
				PositionWrapper current = new PositionWrapper(position);
				String characterValue = position.getCharacter();

				// Resets the average character width when we see a change in
				// font or a change in the font size
				if (lastPosition != null
						&& ((position.getFont() != lastPosition
								.getTextPosition().getFont()) || (position
								.getFontSize() != lastPosition
								.getTextPosition().getFontSize()))) {
					previousAveCharWidth = -1;
				}

				float positionX;
				float positionY;
				float positionWidth;
				float positionHeight;

				/*
				 * If we are sorting, then we need to use the text direction
				 * adjusted coordinates, because they were used in the sorting.
				 */
				if (shouldSortByPosition()) {
					positionX = position.getXDirAdj();
					positionY = position.getYDirAdj();
					positionWidth = position.getWidthDirAdj();
					positionHeight = position.getHeightDir();
				} else {
					positionX = position.getX();
					positionY = position.getY();
					positionWidth = position.getWidth();
					positionHeight = position.getHeight();
				}

				// The current amount of characters in a word
				int wordCharCount = position.getIndividualWidths().length;

				/*
				 * Estimate the expected width of the space based on the space
				 * character with some margin.
				 */
				float wordSpacing = position.getWidthOfSpace();
				float deltaSpace = 0;
				// NaN never compares equal to anything (itself included), so
				// it has to be detected with Float.isNaN rather than ==.
				if ((wordSpacing == 0) || Float.isNaN(wordSpacing)) {
					deltaSpace = Float.MAX_VALUE;
				} else {
					// A NaN space width recorded for the previous position is
					// treated like "no previous value" so that it cannot
					// poison the average.
					if (lastWordSpacing < 0 || Float.isNaN(lastWordSpacing)) {
						deltaSpace = (wordSpacing * getSpacingTolerance());
					} else {
						deltaSpace = (((wordSpacing + lastWordSpacing) / 2f) * getSpacingTolerance());
					}
				}

				/*
				 * Estimate the expected width of the space based on the average
				 * character width with some margin. This calculation does not
				 * make a true average (average of averages) but we found that
				 * it gave the best results after numerous experiments. Based on
				 * experiments we also found that .3 worked well.
				 */
				float averageCharWidth = -1;
				if (previousAveCharWidth < 0) {
					averageCharWidth = (positionWidth / wordCharCount);
				} else {
					averageCharWidth = (previousAveCharWidth + (positionWidth / wordCharCount)) / 2f;
				}
				float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());

				// Compares the values obtained by the average method and the
				// wordSpacing method and picks the smaller number.
				float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
				if (endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE) {
					if (deltaCharWidth > deltaSpace) {
						expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
					} else {
						expectedStartOfNextWordX = endOfLastTextX
								+ deltaCharWidth;
					}
				}

				if (lastPosition != null) {
					if (startOfArticle) {
						lastPosition.setArticleStart();
						startOfArticle = false;
					}
					// RDD - Here we determine whether this text object is on
					// the current line. We use the lastBaselineFontSize to
					// handle the superscript case, and the size of the current
					// font to handle the subscript case. Text must overlap
					// with the last rendered baseline text by at least a small
					// amount in order to be considered as being on the same
					// line.

					/*
					 * XXX BC: In theory, this check should really check if the
					 * next char is in full range seen in this line. This is
					 * what I tried to do with minYTopForLine, but this caused a
					 * lot of regression test failures. So, I'm leaving it be
					 * for now.
					 */
					if (!overlap(positionY, positionHeight, maxYForLine,
							maxHeightForLine)) {
						// The position is on a new line: flush the queued
						// line. Normalization must happen per line because it
						// reorders right-to-left lines as a whole.
						line = normalize(line, isRtlDominant, outputEncoding);
						writeLine(line);
						line.clear();

						lastLineStartPosition = handleLineSeparation(current,
								lastPosition, lastLineStartPosition);

						endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
						expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
						maxYForLine = MAXYFORLINE_RESET_VALUE;
						maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
						minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
					}

					// Test if our TextPosition starts after a new word would be
					// expected to start.
					if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
							&& expectedStartOfNextWordX < positionX
							&&
							// only bother adding a space if the last character
							// was not a space
							lastPosition.getTextPosition().getCharacter() != null
							&& !lastPosition.getTextPosition().getCharacter()
									.endsWith(" ")) {
						line.add(WordSeparator.getSeparator());
					}
				}

				if (positionY >= maxYForLine) {
					maxYForLine = positionY;
				}

				// RDD - endX is what PDF considers to be the x coordinate of
				// the end position of the text. We use it in computing our
				// metrics below.
				endOfLastTextX = positionX + positionWidth;

				// add it to the list
				if (characterValue != null) {
					if (startOfPage && lastPosition == null) {
						writeParagraphStart();// not sure this is correct for
												// RTL?
					}
					line.add(position);
				}
				maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
				minYTopForLine = Math.min(minYTopForLine, positionY
						- positionHeight);
				lastPosition = current;
				if (startOfPage) {
					// the very first position of the page starts both a line
					// and a paragraph
					lastPosition.setParagraphStart();
					lastPosition.setLineStart();
					lastLineStartPosition = lastPosition;
					startOfPage = false;
				}
				lastWordSpacing = wordSpacing;
				previousAveCharWidth = averageCharWidth;
			}

			// print the final line
			if (line.size() > 0) {
				line = normalize(line, isRtlDominant, outputEncoding);
				writeLine(line);
				writeParagraphEnd();
			}

			endArticle();
		}
		writePageEnd();
	}

	/**
	 * Writes one queued line to the output. {@link WordSeparator} entries are
	 * written as a word separator; every other entry is written as
	 * characters.
	 * 
	 * @param line
	 *            the entries of the line, in output order.
	 * @throws IOException
	 *             if writing to the output fails.
	 */
	protected void writeLine(List<TextPosition> line) throws IOException {
		Iterator<TextPosition> entries = line.iterator();
		while (entries.hasNext()) {
			TextPosition entry = entries.next();
			if (!(entry instanceof WordSeparator)) {
				writeCharacters(entry);
				continue;
			}
			writeWordSeparator();
		}
	}

	/**
	 * Builds the normalized form of a queued line. Every entry except
	 * {@link WordSeparator} markers is wrapped in a
	 * {@link NormalizedTextPosition}; markers are passed through as they are.
	 * When right-to-left text is dominant the order of the entries is
	 * reversed, otherwise it is kept.
	 * 
	 * @param line
	 *            the entries of the line; not modified.
	 * @param isRtlDominant
	 *            true if right-to-left text dominates the article.
	 * @param outputEncoding
	 *            the output encoding handed to each wrapper.
	 * @return a new list holding the normalized entries.
	 */
	protected List<TextPosition> normalize(List<TextPosition> line,
			boolean isRtlDominant, String outputEncoding) {
		LinkedList<TextPosition> normalized = new LinkedList<TextPosition>();
		for (TextPosition text : line) {
			TextPosition entry;
			if (text instanceof WordSeparator) {
				entry = text;
			} else {
				entry = new NormalizedTextPosition(text, isRtlDominant,
						outputEncoding);
			}
			if (isRtlDominant) {
				// prepend, so the line ends up in reverse order
				normalized.addFirst(entry);
			} else {
				normalized.addLast(entry);
			}
		}
		return normalized;
	}

	/**
	 * Internal marker class. Its single shared instance is used as a place
	 * holder for a word separator in a line of TextPositions.
	 * 
	 * @author ME21969
	 * 
	 */
	protected static final class WordSeparator extends TextPosition {
		/** The only instance; the constructor is private. */
		private static final WordSeparator INSTANCE = new WordSeparator();

		private WordSeparator() {
		}

		/**
		 * @return the shared marker instance.
		 */
		public static final WordSeparator getSeparator() {
			return INSTANCE;
		}

	}

	/**
	 * A wrapped TextPosition whose {@link #getCharacter()} returns normalized
	 * text. The normalized text is computed on first request and then reused.
	 */
	protected static class NormalizedTextPosition extends WrappedTextPosition {
		protected String outputEncoding = null;
		/**
		 * The normalizer is used to remove text ligatures/presentation forms
		 * and to correct the direction of right to left text, such as Arabic
		 * and Hebrew. One normalizer is kept per output encoding and shared by
		 * all instances.
		 */
		private static final Map<String, TextNormalize> normalizers = new HashMap<String, TextNormalize>();
		private boolean isRtlDominant = false;
		private String normalizedText = null;

		/**
		 * @param src
		 *            the text position to wrap.
		 * @param isRtlDominant
		 *            true if right-to-left text dominates the surrounding
		 *            article.
		 * @param outputEncoding
		 *            the output encoding; selects the normalizer.
		 */
		public NormalizedTextPosition(TextPosition src, boolean isRtlDominant,
				String outputEncoding) {
			super(src);
			this.isRtlDominant = isRtlDominant;
			this.outputEncoding = outputEncoding;
		}

		/**
		 * Returns the normalizer for the given output encoding, creating and
		 * remembering it on first use.
		 * 
		 * @param outputEncoding
		 *            the output encoding.
		 * @return the normalizer registered for that encoding.
		 */
		protected static final TextNormalize getNormalize(String outputEncoding) {
			TextNormalize normalizer = normalizers.get(outputEncoding);
			if (normalizer == null) {
				normalizer = new TextNormalize(outputEncoding);
				normalizers.put(outputEncoding, normalizer);
			}
			return normalizer;
		}

		/**
		 * returns the text of this TextPosition as a String, after first
		 * normalizing it in two ways.
		 * <ol>
		 * <li>if {@link #isRtlDominant()} is true, then reorders the text to
		 * logical ordering.</li>
		 * <li>normalizes for presentation - for example changing ligatures to
		 * plain-text equivalents.</li>
		 * </ol>
		 */
		public String getCharacter() {
			if (normalizedText != null) {
				return normalizedText;
			}
			normalizedText = src.getCharacter();
			TextNormalize normalizer = getNormalize(outputEncoding);
			if (isRtlDominant) {
				normalizedText = normalizer.makeLineLogicalOrder(
						normalizedText, isRtlDominant);
			}
			normalizedText = normalizer.normalizePres(normalizedText);
			return normalizedText;
		}

		/**
		 * @return true if this position was created for an article in which
		 *         right-to-left text is dominant.
		 */
		public boolean isRtlDominant() {
			return isRtlDominant;
		}

	}

	/**
	 * A TextPosition that forwards every call declared here to another
	 * TextPosition. Subclasses override individual methods to alter what is
	 * reported for the wrapped position.
	 */
	protected static class WrappedTextPosition extends TextPosition {
		/** The wrapped text position; every method below forwards to it. */
		protected TextPosition src = null;

		/**
		 * @param src
		 *            the text position to wrap.
		 */
		public WrappedTextPosition(TextPosition src) {
			super();
			this.src = src;
		}

		// --- text ---

		public String getCharacter() {
			return this.src.getCharacter();
		}

		// --- placement ---

		public Matrix getTextPos() {
			return this.src.getTextPos();
		}

		public float getDir() {
			return this.src.getDir();
		}

		public float getX() {
			return this.src.getX();
		}

		public float getXDirAdj() {
			return this.src.getXDirAdj();
		}

		public float getY() {
			return this.src.getY();
		}

		public float getYDirAdj() {
			return this.src.getYDirAdj();
		}

		// --- extent ---

		public float getWidth() {
			return this.src.getWidth();
		}

		public float getWidthDirAdj() {
			return this.src.getWidthDirAdj();
		}

		public float getHeight() {
			return this.src.getHeight();
		}

		public float getHeightDir() {
			return this.src.getHeightDir();
		}

		// --- font and spacing ---

		public float getFontSize() {
			return this.src.getFontSize();
		}

		public float getFontSizeInPt() {
			return this.src.getFontSizeInPt();
		}

		public PDFont getFont() {
			return this.src.getFont();
		}

		public float getWordSpacing() {
			return this.src.getWordSpacing();
		}

		public float getWidthOfSpace() {
			return this.src.getWidthOfSpace();
		}

		public float getXScale() {
			return this.src.getXScale();
		}

		public float getYScale() {
			return this.src.getYScale();
		}

		public float[] getIndividualWidths() {
			return this.src.getIndividualWidths();
		}

		// --- miscellaneous ---

		public String toString() {
			return this.src.toString();
		}

		public boolean contains(TextPosition tp2) {
			return this.src.contains(tp2);
		}

		public void mergeDiacritic(TextPosition diacritic,
				TextNormalize normalize) {
			this.src.mergeDiacritic(diacritic, normalize);
		}

		public boolean isDiacritic() {
			return this.src.isDiacritic();
		}
	}
}