/**
*
*/
package de.uni_koeln.ub.drc.reader.temp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextNormalize;
import org.apache.pdfbox.util.TextPosition;
import org.apache.pdfbox.util.TextPositionComparator;
/**
* subclass of {@link PDFTextStripper PDFTextStripper} that attempts to detect
* white-space between paragraphs. This class inserts an extra newline when it
* finds such. This also inserts an extra newline before each page start.
* Subclasses can override that behavior by overriding the
* {@link #getLineSeparator()}, {@link #getParagraphStart()},
* {@link #getParagraphEnd()} and {@link #getPageSeparator()} methods.
* <p>
* The values used for paragraph drop and indent detection can be set
* programmatically using {@link #setDropThreshold(float)} and
* {@link #setIndentThreshold(float)} or the defaults can be changed using the
* System Properties:
*
* <pre>
* pdftextstripper2.drop
* pdftextstripper2.indent
* </pre>
*
* which can be set using the -D switch at the start of the Java runtime.
* </p>
*
* @author m.martinez@ll.mit.edu
*
*/
/* Third-party code */@SuppressWarnings("all")
public class PDFTextStripper2 extends PDFTextStripper {
/** This class; its lower-cased simple name prefixes the system property keys. */
private static final Class thisClass = PDFTextStripper2.class;
/** Default indent threshold (2.0 unless overridden by system property). */
private static float DEFAULT_INDENT_THRESHOLD = 2.0f;
/** Default drop threshold (2.5 unless overridden by system property). */
private static float DEFAULT_DROP_THRESHOLD = 2.5f;
// The two defaults above may be overridden when the JVM starts, using the
// -D system properties:
// pdftextstripper2.indent
// pdftextstripper2.drop
static {
    final String prefix = thisClass.getSimpleName().toLowerCase();
    DEFAULT_INDENT_THRESHOLD = floatProperty(prefix + ".indent",
            DEFAULT_INDENT_THRESHOLD);
    DEFAULT_DROP_THRESHOLD = floatProperty(prefix + ".drop",
            DEFAULT_DROP_THRESHOLD);
}
/**
 * Reads a float from a system property.
 *
 * @param key
 *            the system property name
 * @param fallback
 *            the value to return when the property is unset, empty or not
 *            parseable as a float
 * @return the parsed property value, or {@code fallback}
 */
private static float floatProperty(String key, float fallback) {
    final String raw = System.getProperty(key);
    if (raw == null || raw.length() == 0) {
        return fallback;
    }
    try {
        return Float.parseFloat(raw);
    } catch (NumberFormatException ignored) {
        // deliberately ignored: an unparseable value keeps the default
        return fallback;
    }
}
// Marker strings written around paragraphs, pages and articles.
private String paragraphStart = "";
private String paragraphEnd = lineSeparator;
private String pageStart = lineSeparator;
private String pageEnd = lineSeparator;
private String articleStart = lineSeparator;
private String articleEnd = lineSeparator;
// Per-instance thresholds, seeded from the (possibly overridden) defaults.
private float indentThreshold = DEFAULT_INDENT_THRESHOLD;
private float dropThreshold = DEFAULT_DROP_THRESHOLD;
/**
 * Gets the indent threshold: the multiple of the current text's space
 * character width by which a line start may be indented from the previous
 * line start before it is considered to start a new paragraph.
 *
 * @return the number of space-character widths used when detecting
 *         paragraph indents.
 */
public float getIndentThreshold() {
    return this.indentThreshold;
}
/**
 * Sets the indent threshold: the multiple of the current text's space
 * character width by which a line start may be indented from the previous
 * line start before it is considered to start a new paragraph. The default
 * value is 2.0.
 *
 * @param indentThreshold
 *            the number of space-character widths to use when detecting
 *            paragraph indents.
 */
public void setIndentThreshold(final float indentThreshold) {
    this.indentThreshold = indentThreshold;
}
/**
 * Gets the drop threshold: the vertical gap, expressed as a multiple of the
 * current text height, beyond which a line start is considered to start a
 * new paragraph.
 *
 * @return the character-height multiple for the largest vertical gap
 *         allowed between lines of the same paragraph.
 */
public float getDropThreshold() {
    return this.dropThreshold;
}
/**
 * Sets the drop threshold: the vertical gap, expressed as a multiple of the
 * current text height, beyond which a line start is considered to start a
 * new paragraph. The default value is 2.5.
 *
 * @param dropThreshold
 *            the character-height multiple for the largest vertical gap
 *            allowed between lines of the same paragraph.
 */
public void setDropThreshold(final float dropThreshold) {
    this.dropThreshold = dropThreshold;
}
/**
 * Gets the string that {@link #writeParagraphStart()} writes to the output.
 *
 * @return the paragraph start marker.
 */
public String getParagraphStart() {
    return this.paragraphStart;
}
/**
 * Sets the string that {@link #writeParagraphStart()} writes to the output.
 *
 * @param s
 *            the new paragraph start marker.
 */
public void setParagraphStart(final String s) {
    paragraphStart = s;
}
/**
 * Gets the string that {@link #writeParagraphEnd()} writes to the output.
 *
 * @return the paragraph end marker.
 */
public String getParagraphEnd() {
    return this.paragraphEnd;
}
/**
 * Sets the string that {@link #writeParagraphEnd()} writes to the output.
 *
 * @param s
 *            the new paragraph end marker.
 */
public void setParagraphEnd(final String s) {
    paragraphEnd = s;
}
/**
 * Gets the string that {@link #writePageStart()} writes to the output.
 *
 * @return the page start marker.
 */
public String getPageStart() {
    return this.pageStart;
}
/**
 * Sets the string that {@link #writePageStart()} writes to the output.
 *
 * @param pageStart
 *            the new page start marker.
 */
public void setPageStart(final String pageStart) {
    this.pageStart = pageStart;
}
/**
 * Gets the string that {@link #writePageEnd()} writes to the output.
 *
 * @return the page end marker.
 */
public String getPageEnd() {
    return this.pageEnd;
}
/**
 * Sets the string that {@link #writePageEnd()} writes to the output.
 *
 * @param pageEnd
 *            the new page end marker.
 */
public void setPageEnd(final String pageEnd) {
    this.pageEnd = pageEnd;
}
/**
 * Returns the text that separates two consecutive pages, built as the page
 * end marker followed by the page start marker.
 *
 * @deprecated - not used in PDFTextStripper2. Use discrete
 *             {@link #getPageStart()} and {@link #getPageEnd()} instead.
 * @return the page end string concatenated with the page start string.
 */
@Deprecated
public String getPageSeparator() {
    return getPageEnd() + getPageStart();
}
/**
 * Does nothing: the supplied separator is ignored and no state is changed.
 *
 * @deprecated - not used in PDFTextStripper2. Use discrete
 *             {@link #setPageStart(String)} and {@link #setPageEnd(String)}
 *             instead.
 * @param separator
 *            The desired page separator string (ignored).
 */
@Deprecated
public void setPageSeparator(String separator) {
    // intentionally empty: page boundaries are controlled by the page
    // start and page end strings instead
}
/**
 * Gets the string that {@link #startArticle(boolean)} writes to the output.
 *
 * @return the article start marker.
 */
public String getArticleStart() {
    return this.articleStart;
}
/**
 * Sets the string that {@link #startArticle(boolean)} writes to the output.
 *
 * @param articleStart
 *            the new article start marker.
 */
public void setArticleStart(final String articleStart) {
    this.articleStart = articleStart;
}
/**
 * Gets the string that {@link #endArticle()} writes to the output.
 *
 * @return the article end marker.
 */
public String getArticleEnd() {
    return this.articleEnd;
}
/**
 * Sets the string that {@link #endArticle()} writes to the output.
 *
 * @param articleEnd
 *            the new article end marker.
 */
public void setArticleEnd(final String articleEnd) {
    this.articleEnd = articleEnd;
}
/**
 * Writes the article start marker to the output.
 *
 * @param isltr
 *            not consulted by this implementation.
 * @throws IOException
 *             if writing to the output fails.
 */
protected void startArticle(boolean isltr) throws IOException {
    final String marker = getArticleStart();
    output.write(marker);
}
/**
 * Writes the article end marker to the output.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void endArticle() throws IOException {
    final String marker = getArticleEnd();
    output.write(marker);
}
/**
* {@inheritDoc}
*/
public PDFTextStripper2() throws IOException {
super();
}
/**
 * Creates a stripper by passing the argument straight to the matching
 * {@link PDFTextStripper} constructor.
 *
 * @param s
 *            forwarded unchanged to the parent constructor (presumably the
 *            output encoding name - confirm against the parent class).
 * @throws IOException
 *             if the parent constructor fails.
 */
public PDFTextStripper2(final String s) throws IOException {
    super(s);
}
/**
 * Creates a stripper by passing the argument straight to the matching
 * {@link PDFTextStripper} constructor.
 *
 * @param properties
 *            forwarded unchanged to the parent constructor; see the parent
 *            class for the supported settings.
 * @throws IOException
 *             if the parent constructor fails.
 */
public PDFTextStripper2(final Properties properties) throws IOException {
    super(properties);
}
/**
 * Handles the transition to a new line. Marks the current position as a
 * line start, lets {@link #isParagraphSeparation} decide whether it also
 * starts a paragraph, and writes the corresponding separators:
 * <ul>
 * <li>paragraph start directly after an article start: only the paragraph
 * start marker;</li>
 * <li>any other paragraph start: a line separator followed by the paragraph
 * separator;</li>
 * <li>otherwise: a line separator.</li>
 * </ul>
 *
 * @param current
 *            the current text position, which begins the new line
 * @param lastPosition
 *            the previous text position
 * @param lastLineStartPosition
 *            the last text position that followed a line separator.
 * @return the new line start position, which is always {@code current}.
 * @throws IOException
 *             if writing to the output fails.
 */
protected PositionWrapper handleLineSeparation(PositionWrapper current,
        PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
        throws IOException {
    current.setLineStart();
    isParagraphSeparation(current, lastPosition, lastLineStartPosition);
    final boolean startsParagraph = current.isParagraphStart();
    if (startsParagraph && lastPosition.isArticleStart()) {
        // the article start already separated the text from what came before
        writeParagraphStart();
    } else {
        writeLineSeparator();
        if (startsParagraph) {
            writeParagraphSeparator();
        }
    }
    return current;
}
/**
 * Decides whether the gap between the previous text position and the
 * current one is a paragraph separation, and records the outcome on the
 * current position. This should <i>only</i> be called for consecutive text
 * positions that have already passed the line separation test.
 * <p>
 * The current position is flagged as a paragraph start when any of these
 * holds:
 * </p>
 * <ul>
 * <li>there is no previous line start;</li>
 * <li>the vertical distance to the previous position exceeds the drop
 * threshold times the current text height;</li>
 * <li>the line is indented past the previous line start by more than the
 * indent threshold times the width of a space, and the previous line start
 * was not itself a paragraph start (if it was, the current position is
 * flagged as a hanging indent instead);</li>
 * <li>the line starts more than one space width to the left of the previous
 * line start, and the previous line start was not a paragraph start;</li>
 * <li>the line is aligned with the previous line start (within a quarter of
 * the current text width), the previous line start was a paragraph start
 * and not a hanging indent, and both match the same list item pattern.</li>
 * </ul>
 * <p>
 * A line aligned with a hanging-indent line start is itself flagged as a
 * hanging indent.
 * </p>
 *
 * @param position
 *            the current text position. This may have its isParagraphStart
 *            or isHangingIndent flags set upon return.
 * @param lastPosition
 *            the previous text position (should not be null).
 * @param lastLineStartPosition
 *            the last text position that followed a line separator. May be
 *            null.
 */
protected void isParagraphSeparation(PositionWrapper position,
        PositionWrapper lastPosition, PositionWrapper lastLineStartPosition) {
    boolean paragraphStart;
    if (lastLineStartPosition == null) {
        paragraphStart = true;
    } else {
        final TextPosition text = position.getTextPosition();
        final float yGap = Math.abs(text.getYDirAdj()
                - lastPosition.getTextPosition().getYDirAdj());
        // do we need to flip this for rtl?
        final float xGap = text.getXDirAdj()
                - lastLineStartPosition.getTextPosition().getXDirAdj();
        if (yGap > (getDropThreshold() * text.getHeightDir())) {
            paragraphStart = true;
        } else if (xGap > (getIndentThreshold() * text.getWidthOfSpace())) {
            // text is indented, but try to screen for hanging indent
            paragraphStart = !lastLineStartPosition.isParagraphStart();
            if (!paragraphStart) {
                position.setHangingIndent();
            }
        } else if (xGap < -text.getWidthOfSpace()) {
            // text is left of previous line. Was it a hanging indent?
            paragraphStart = !lastLineStartPosition.isParagraphStart();
        } else if (Math.abs(xGap) < (0.25 * text.getWidth())) {
            // within 1/4 of a character of the last line start: treat the
            // two lines as lined up
            paragraphStart = false;
            if (lastLineStartPosition.isHangingIndent()) {
                position.setHangingIndent();
            } else if (lastLineStartPosition.isParagraphStart()) {
                // consecutive lines matching the same list item format are
                // separate paragraphs; identity comparison is intended, both
                // results come from the same pattern list
                final Pattern previous = matchListItemPattern(lastLineStartPosition);
                paragraphStart = previous != null
                        && previous == matchListItemPattern(position);
            }
        } else {
            paragraphStart = false;
        }
    }
    if (paragraphStart) {
        position.setParagraphStart();
    }
}
/**
 * Finds the list item pattern that matches the text of the given position.
 * The candidates come from {@link #getListItemPatterns()}; to change them,
 * override that method or supply a list with
 * {@link #setListItemPatterns(List)}.
 *
 * @param pw
 *            the wrapped text position whose text is tested
 * @return the first matching Pattern as chosen by
 *         {@link #matchPattern(String, List)}, or null if none matches.
 */
protected Pattern matchListItemPattern(PositionWrapper pw) {
    return matchPattern(pw.getTextPosition().getCharacter(),
            getListItemPatterns());
}
/**
 * Iterates over the specified list of Patterns until it finds one that
 * matches the whole of the specified string, then returns that Pattern.
 * <p>
 * Order of the supplied list of patterns is important as most common
 * patterns should come first. Patterns should be strict in general, and all
 * will be used with case sensitivity on.
 * </p>
 *
 * @param s
 *            the text to test; may be null, in which case nothing matches
 * @param patterns
 *            the candidate patterns in priority order; may be null, in
 *            which case nothing matches
 * @return the first Pattern in {@code patterns} that matches {@code s}, or
 *         null if there is none.
 */
protected static final Pattern matchPattern(String s, List<Pattern> patterns) {
    // a text position may carry no text; treat that as "no match" rather
    // than failing inside Pattern.matcher
    if (s == null || patterns == null) {
        return null;
    }
    for (Pattern candidate : patterns) {
        if (candidate.matcher(s).matches()) {
            return candidate;
        }
    }
    return null;
}
/**
 * Compiled list item patterns. Null until {@link #getListItemPatterns()}
 * builds the list or {@link #setListItemPatterns(List)} supplies one.
 */
private List<Pattern> liPatterns = null;
/**
 * Regular expressions that match commonly used list item markers, i.e.
 * bullets, numbers, letters, Roman numerals, etc. Not meant to be
 * comprehensive.
 */
public static final String[] LIST_ITEM_EXPRESSIONS = {
    "\\.", // a single period
    "\\d+\\.", // digits and a period, e.g. "1."
    "\\[\\d+\\]", // digits in square brackets, e.g. "[1]"
    "\\d+\\)", // digits and a closing parenthesis, e.g. "1)"
    "[A-Z]\\.", // one upper-case letter and a period, e.g. "A."
    "[a-z]\\.", // one lower-case letter and a period, e.g. "a."
    "[A-Z]\\)", // one upper-case letter and a parenthesis, e.g. "A)"
    "[a-z]\\)", // one lower-case letter and a parenthesis, e.g. "a)"
    "[IVXL]+\\.", // upper-case Roman numeral and a period, e.g. "IV."
    "[ivxl]+\\." // lower-case Roman numeral and a period, e.g. "iv."
};
/**
 * Returns the regular expression Patterns used to recognise common list
 * item formats. For example, the items of a numbered list such as
 * <ol>
 * <li>some text</li>
 * <li>more text</li>
 * </ol>
 * or of a bulleted list such as
 * <ul>
 * <li>some text</li>
 * <li>more text</li>
 * </ul>
 * all begin with some character pattern, e.g. "\\d+\." (matches "1.", "2.",
 * ...) or "\[\\d+\]" (matches "[1]", "[2]", ...).
 * <p>
 * Unless a list has already been built or supplied, the list is created on
 * this call by compiling each entry of {@link #LIST_ITEM_EXPRESSIONS} in
 * order, and is then reused by later calls.
 * </p>
 *
 * @return a list of Pattern objects.
 */
protected List<Pattern> getListItemPatterns() {
    if (liPatterns != null) {
        return liPatterns;
    }
    liPatterns = new ArrayList<Pattern>();
    for (int idx = 0; idx < LIST_ITEM_EXPRESSIONS.length; idx++) {
        liPatterns.add(Pattern.compile(LIST_ITEM_EXPRESSIONS[idx]));
    }
    return liPatterns;
}
/**
 * Replaces the regular expression patterns used to match list item starts.
 * The given list is stored as is (not copied); passing null causes
 * {@link #getListItemPatterns()} to rebuild the default list on its next
 * call.
 *
 * @param patterns
 *            the patterns to use from now on
 */
protected void setListItemPatterns(final List<Pattern> patterns) {
    this.liPatterns = patterns;
}
/**
 * Writes a paragraph separator to the output: the paragraph end marker
 * followed by the paragraph start marker.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writeParagraphSeparator() throws IOException {
    // close the previous paragraph first, then open the next one
    this.writeParagraphEnd();
    this.writeParagraphStart();
}
/**
 * Writes the paragraph start marker to the output.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writeParagraphStart() throws IOException {
    final String marker = getParagraphStart();
    output.write(marker);
}
/**
 * Writes the paragraph end marker to the output.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writeParagraphEnd() throws IOException {
    final String marker = getParagraphEnd();
    output.write(marker);
}
/**
 * Writes the page start marker to the output.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writePageStart() throws IOException {
    final String marker = getPageStart();
    output.write(marker);
}
/**
 * Writes the page end marker to the output.
 *
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writePageEnd() throws IOException {
    final String marker = getPageEnd();
    output.write(marker);
}
/**
 * Normalizer for removing text ligatures/presentation forms and for
 * correcting the direction of right to left text, such as Arabic and
 * Hebrew. It is created for this stripper's output encoding.
 * <p>
 * NOTE - this field duplicates the functionality of the private field by
 * the same name in the parent class. Could be eliminated with a couple of
 * minor mods of the parent.
 * </p>
 * <p>
 * NOTE(review): no executable code in this class reads this field; the
 * normalization that is actually applied goes through
 * {@link NormalizedTextPosition}.
 * </p>
 */
private TextNormalize normalize = new TextNormalize(outputEncoding);
/**
 * Tests whether two vertical position+height pairs overlap. They overlap
 * when the two y values are within 0.1 of each other, or when either y
 * value lies between the other y value and that y value minus its height
 * (bounds included).
 * <p>
 * NOTE - this Duplicates functionality of a private method by the same name
 * in the parent class.
 * </p>
 *
 * @param y1
 *            the first vertical position
 * @param height1
 *            the height associated with {@code y1}
 * @param y2
 *            the second vertical position
 * @param height2
 *            the height associated with {@code y2}
 * @return true if the two vertical extents overlap.
 */
protected final boolean overlap(float y1, float height1, float y2,
        float height2) {
    if (within(y1, y2, .1f)) {
        return true;
    }
    final boolean secondInsideFirst = y2 <= y1 && y2 >= y1 - height1;
    final boolean firstInsideSecond = y1 <= y2 && y1 >= y2 - height2;
    return secondInsideFirst || firstInsideSecond;
}
/**
 * Determines whether two floating point numbers are within a specified
 * variance of each other. Both bounds are exclusive.
 * <p>
 * NOTE - this Duplicates functionality of a private method by the same name
 * in the parent class.
 * </p>
 *
 * @param first
 *            The first number to compare to.
 * @param second
 *            The second number to compare to.
 * @param variance
 *            The allowed variance.
 * @return true if {@code second} lies strictly between
 *         {@code first - variance} and {@code first + variance}.
 */
protected boolean within(float first, float second, float variance) {
    final float lowerBound = first - variance;
    final float upperBound = first + variance;
    return lowerBound < second && second < upperBound;
}
// Values that writePage() gives its line-tracking variables at the start of
// a page; all but the last-word-spacing value are assigned again whenever a
// new line begins.
/** Marks "no end-of-text x recorded yet" for the current line. */
private static final float ENDOFLASTTEXTX_RESET_VALUE = -1f;
/** Starting point for the running maximum y of the current line. */
private static final float MAXYFORLINE_RESET_VALUE = -Float.MAX_VALUE;
/** Marks "no expected start of next word computed". */
private static final float EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE = -Float.MAX_VALUE;
/** Starting point for the running maximum height of the current line. */
private static final float MAXHEIGHTFORLINE_RESET_VALUE = -1f;
/** Starting point for the running minimum top y of the current line. */
private static final float MINYTOPFORLINE_RESET_VALUE = Float.MAX_VALUE;
/** Marks "no previous word spacing recorded" (any negative value does). */
private static final float LASTWORDSPACING_RESET_VALUE = -1f;
/**
 * This will print the text of the processed page to "output". It will
 * estimate, based on the coordinates of the text, where newlines and word
 * spacings should be placed. The text will be sorted only if that feature
 * was enabled.
 * <p>
 * NOTE - this overrides the parent class'
 * {@link PDFTextStripper#writePage() writePage} method. It unfortunately
 * copies in much of the parent code, with only minor mods to account for
 * visibility. Functionally, the main difference is that line breaks are
 * routed through
 * {@link #handleLineSeparation(PositionWrapper, PositionWrapper, PositionWrapper)}
 * so that paragraph boundaries can be detected, and that some state
 * information is tracked during the parse. Copying the entire method would
 * not be necessary with only a few mods to the parent class.
 * </p>
 * <p>
 * Output order: the page start marker (only when the page has at least one
 * article), then for every article its start marker, its lines and its end
 * marker, and finally the page end marker.
 * </p>
 *
 * @throws IOException
 *             If there is an error writing the text.
 */
protected void writePage() throws IOException {
    float maxYForLine = MAXYFORLINE_RESET_VALUE;
    float minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
    float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
    float lastWordSpacing = LASTWORDSPACING_RESET_VALUE;
    float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
    PositionWrapper lastPosition = null;
    PositionWrapper lastLineStartPosition = null;
    boolean startOfPage = true; // true until the first position is handled
    boolean startOfArticle = true;
    if (charactersByArticle.size() > 0) {
        writePageStart();
    }
    for (int i = 0; i < charactersByArticle.size(); i++) {
        List textList = (List) charactersByArticle.get(i);
        if (shouldSortByPosition()) {
            TextPositionComparator comparator = new TextPositionComparator();
            Collections.sort(textList, comparator);
        }
        /*
         * Before we can display the text, we need to do some normalizing.
         * Arabic and Hebrew text is right to left and is typically stored
         * in its logical format, which means that the rightmost character
         * is stored first, followed by the second character from the right
         * etc. However, PDF stores the text in presentation form, which is
         * left to right. We need to do some normalization to convert the
         * PDF data to the proper logical output format.
         *
         * Note that if we did not sort the text, then the output of
         * reversing the text is undefined and can sometimes produce worse
         * output than not trying to reverse the order. Sorting should be
         * done for these languages.
         */
        boolean isRtlDominant = detectRtlDominance(textList);
        startArticle(!isRtlDominant);
        startOfArticle = true;
        /*
         * Now cycle through to print the text. We queue up a line at a time
         * before we print so that we can convert the line from presentation
         * form to logical form (if needed).
         */
        List<TextPosition> line = new ArrayList<TextPosition>();
        Iterator textIter = textList.iterator();
        /*
         * PDF files don't always store spaces. We will need to guess where
         * we should add spaces based on the distances between
         * TextPositions. Historically, this was done based on the size of
         * the space character provided by the font. In general, this worked
         * but there were cases where it did not work. Calculating the
         * average character width and using that as a metric works better
         * in some cases but fails in some cases where the spacing worked.
         * So we use both. NOTE: Adobe reader also fails on some of these
         * examples.
         */
        // Keeps track of the previous average character width
        float previousAveCharWidth = -1;
        while (textIter.hasNext()) {
            TextPosition position = (TextPosition) textIter.next();
            PositionWrapper current = new PositionWrapper(position);
            String characterValue = position.getCharacter();
            // Resets the average character width when we see a change in
            // font or a change in the font size
            if (lastPosition != null
                    && ((position.getFont() != lastPosition
                            .getTextPosition().getFont()) || (position
                            .getFontSize() != lastPosition
                            .getTextPosition().getFontSize()))) {
                previousAveCharWidth = -1;
            }
            float positionX;
            float positionY;
            float positionWidth;
            float positionHeight;
            /*
             * If we are sorting, then we need to use the text direction
             * adjusted coordinates, because they were used in the sorting.
             */
            if (shouldSortByPosition()) {
                positionX = position.getXDirAdj();
                positionY = position.getYDirAdj();
                positionWidth = position.getWidthDirAdj();
                positionHeight = position.getHeightDir();
            } else {
                positionX = position.getX();
                positionY = position.getY();
                positionWidth = position.getWidth();
                positionHeight = position.getHeight();
            }
            // The current amount of characters in a word
            int wordCharCount = position.getIndividualWidths().length;
            /*
             * Estimate the expected width of the space based on the space
             * character with some margin.
             */
            float wordSpacing = position.getWidthOfSpace();
            float deltaSpace = 0;
            // NaN never compares equal to anything (itself included), so it
            // has to be detected with Float.isNaN
            if ((wordSpacing == 0) || Float.isNaN(wordSpacing)) {
                deltaSpace = Float.MAX_VALUE;
            } else {
                if (lastWordSpacing < 0) {
                    deltaSpace = (wordSpacing * getSpacingTolerance());
                } else {
                    deltaSpace = (((wordSpacing + lastWordSpacing) / 2f) * getSpacingTolerance());
                }
            }
            /*
             * Estimate the expected width of the space based on the average
             * character width with some margin. This calculation does not
             * make a true average (average of averages) but we found that
             * it gave the best results after numerous experiments. Based on
             * experiments we also found that .3 worked well.
             */
            float averageCharWidth = -1;
            if (previousAveCharWidth < 0) {
                averageCharWidth = (positionWidth / wordCharCount);
            } else {
                averageCharWidth = (previousAveCharWidth + (positionWidth / wordCharCount)) / 2f;
            }
            float deltaCharWidth = (averageCharWidth * getAverageCharTolerance());
            // Compares the values obtained by the average method and the
            // wordSpacing method and picks the smaller number.
            float expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
            if (endOfLastTextX != ENDOFLASTTEXTX_RESET_VALUE) {
                if (deltaCharWidth > deltaSpace) {
                    expectedStartOfNextWordX = endOfLastTextX + deltaSpace;
                } else {
                    expectedStartOfNextWordX = endOfLastTextX
                            + deltaCharWidth;
                }
            }
            if (lastPosition != null) {
                if (startOfArticle) {
                    lastPosition.setArticleStart();
                    startOfArticle = false;
                }
                // RDD - Here we determine whether this text object is on
                // the current line. We use the lastBaselineFontSize to
                // handle the superscript case, and the size of the current
                // font to handle the subscript case. Text must overlap with
                // the last rendered baseline text by at least a small
                // amount in order to be considered as being on the same
                // line.
                /*
                 * XXX BC: In theory, this check should really check if the
                 * next char is in full range seen in this line. This is
                 * what I tried to do with minYTopForLine, but this caused a
                 * lot of regression test failures. So, I'm leaving it be
                 * for now.
                 */
                if (!overlap(positionY, positionHeight, maxYForLine,
                        maxHeightForLine)) {
                    // the queued line is complete: normalize and flush it,
                    // then emit the line/paragraph separation
                    line = normalize(line, isRtlDominant, outputEncoding);
                    writeLine(line);
                    line.clear();
                    lastLineStartPosition = handleLineSeparation(current,
                            lastPosition, lastLineStartPosition);
                    endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
                    expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
                    maxYForLine = MAXYFORLINE_RESET_VALUE;
                    maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE;
                    minYTopForLine = MINYTOPFORLINE_RESET_VALUE;
                }
                // Test if our TextPosition starts after a new word would be
                // expected to start; only bother adding a space if the last
                // character was not a space
                if (expectedStartOfNextWordX != EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE
                        && expectedStartOfNextWordX < positionX
                        && lastPosition.getTextPosition().getCharacter() != null
                        && !lastPosition.getTextPosition().getCharacter()
                                .endsWith(" ")) {
                    line.add(WordSeparator.getSeparator());
                }
            }
            if (positionY >= maxYForLine) {
                maxYForLine = positionY;
            }
            // RDD - endX is what PDF considers to be the x coordinate of
            // the end position of the text. We use it in computing our
            // metrics below.
            endOfLastTextX = positionX + positionWidth;
            // add it to the list
            if (characterValue != null) {
                if (startOfPage && lastPosition == null) {
                    writeParagraphStart(); // not sure this is correct for RTL?
                }
                line.add(position);
            }
            maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
            minYTopForLine = Math.min(minYTopForLine, positionY
                    - positionHeight);
            lastPosition = current;
            if (startOfPage) {
                // the very first position of a page opens a paragraph and a
                // line
                lastPosition.setParagraphStart();
                lastPosition.setLineStart();
                lastLineStartPosition = lastPosition;
                startOfPage = false;
            }
            lastWordSpacing = wordSpacing;
            previousAveCharWidth = averageCharWidth;
        }
        // print the final line
        if (line.size() > 0) {
            line = normalize(line, isRtlDominant, outputEncoding);
            writeLine(line);
            writeParagraphEnd();
        }
        endArticle();
    }
    writePageEnd();
}
/**
 * Counts the left-to-right and right-to-left characters in the given text
 * positions and reports whether right-to-left characters are in the
 * majority. Positions without text are skipped.
 *
 * @param textList
 *            the TextPosition objects of one article
 * @return true if strictly more right-to-left than left-to-right characters
 *         were found.
 */
private boolean detectRtlDominance(List textList) {
    int ltrCnt = 0;
    int rtlCnt = 0;
    for (Object item : textList) {
        String stringValue = ((TextPosition) item).getCharacter();
        if (stringValue == null) {
            continue;
        }
        for (int a = 0; a < stringValue.length(); a++) {
            byte dir = Character.getDirectionality(stringValue.charAt(a));
            if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT)
                    || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING)
                    || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
                ltrCnt++;
            } else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
                    || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
                    || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
                    || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
                rtlCnt++;
            }
        }
    }
    return rtlCnt > ltrCnt;
}
/**
 * Writes one line of text to the output. Word separator markers are written
 * with {@code writeWordSeparator()}; every other entry is written with
 * {@code writeCharacters(TextPosition)}.
 *
 * @param line
 *            the text positions and word separator markers of the line, in
 *            output order
 * @throws IOException
 *             if writing to the output fails.
 */
protected void writeLine(List<TextPosition> line) throws IOException {
    for (final TextPosition entry : line) {
        if (!(entry instanceof WordSeparator)) {
            writeCharacters(entry);
            continue;
        }
        writeWordSeparator();
    }
}
/**
 * Builds a normalized copy of a line. Every entry except word separator
 * markers is wrapped in a {@link NormalizedTextPosition}; markers are
 * carried over unchanged. When right-to-left text dominates, the order of
 * the entries is reversed. The supplied list is not modified.
 *
 * @param line
 *            the entries of the line
 * @param isRtlDominant
 *            whether right-to-left text dominates
 * @param outputEncoding
 *            passed on to each {@link NormalizedTextPosition}
 * @return a new list holding the normalized entries.
 */
protected List<TextPosition> normalize(List<TextPosition> line,
        boolean isRtlDominant, String outputEncoding) {
    final LinkedList<TextPosition> result = new LinkedList<TextPosition>();
    for (final TextPosition entry : line) {
        final TextPosition wrapped;
        if (entry instanceof WordSeparator) {
            wrapped = entry;
        } else {
            wrapped = new NormalizedTextPosition(entry, isRtlDominant,
                    outputEncoding);
        }
        if (isRtlDominant) {
            // prepending reverses the order of the entries
            result.addFirst(wrapped);
        } else {
            result.addLast(wrapped);
        }
    }
    return result;
}
/**
 * Internal marker class. Its single shared instance is placed in a line of
 * TextPositions wherever a word separator has to be written.
 *
 * @author ME21969
 *
 */
protected static final class WordSeparator extends TextPosition {
    /** The only instance of this marker. */
    private static final WordSeparator INSTANCE = new WordSeparator();
    private WordSeparator() {
        // not instantiable from outside; use getSeparator()
    }
    /**
     * @return the shared marker instance.
     */
    public static final WordSeparator getSeparator() {
        return INSTANCE;
    }
}
/**
 * A wrapped TextPosition whose {@link #getCharacter()} returns normalized
 * text. The normalized text is computed on first request and then reused.
 */
protected static class NormalizedTextPosition extends WrappedTextPosition {
    /** Encoding name used to select the normalizer. */
    protected String outputEncoding = null;
    /**
     * Normalizers by output encoding. A normalizer is used to remove text
     * ligatures/presentation forms and to correct the direction of right to
     * left text, such as Arabic and Hebrew.
     */
    private static final Map<String, TextNormalize> normalizers = new HashMap<String, TextNormalize>();
    private boolean isRtlDominant = false;
    /** Cached result of {@link #getCharacter()}; null until computed. */
    private String normalizedText = null;
    /**
     * @param src
     *            the text position to wrap
     * @param isRtlDominant
     *            whether right-to-left text dominates
     * @param outputEncoding
     *            the encoding name used to select the normalizer
     */
    public NormalizedTextPosition(TextPosition src, boolean isRtlDominant,
            String outputEncoding) {
        super(src);
        this.isRtlDominant = isRtlDominant;
        this.outputEncoding = outputEncoding;
    }
    /**
     * Returns the normalizer for the given output encoding, creating and
     * remembering it if none exists yet.
     *
     * @param outputEncoding
     *            the encoding name
     * @return the normalizer for that encoding.
     */
    protected static final TextNormalize getNormalize(String outputEncoding) {
        TextNormalize normalizer = normalizers.get(outputEncoding);
        if (normalizer == null) {
            normalizer = new TextNormalize(outputEncoding);
            normalizers.put(outputEncoding, normalizer);
        }
        return normalizer;
    }
    /**
     * Returns the text of this TextPosition as a String, after first
     * normalizing it in two ways.
     * <ol>
     * <li>if {@link #isRtlDominant()} is true, then reorders the text to
     * logical ordering.</li>
     * <li>normalizes for presentation - for example changing ligatures to
     * plain-text equivalents.</li>
     * </ol>
     */
    public String getCharacter() {
        if (normalizedText != null) {
            return normalizedText;
        }
        normalizedText = src.getCharacter();
        final TextNormalize normalizer = getNormalize(outputEncoding);
        if (isRtlDominant) {
            normalizedText = normalizer.makeLineLogicalOrder(normalizedText,
                    isRtlDominant);
        }
        normalizedText = normalizer.normalizePres(normalizedText);
        return normalizedText;
    }
    /**
     * @return the right-to-left dominance flag given at construction.
     */
    public boolean isRtlDominant() {
        return isRtlDominant;
    }
}
/**
 * A TextPosition that forwards every method declared here to another
 * TextPosition, so that subclasses can change selected methods while
 * leaving the rest to the wrapped instance.
 */
protected static class WrappedTextPosition extends TextPosition {
    /** The wrapped position that all methods below forward to. */
    protected TextPosition src = null;
    /**
     * @param src
     *            the text position to forward to
     */
    public WrappedTextPosition(TextPosition src) {
        // the no-argument parent constructor is invoked implicitly
        this.src = src;
    }
    // ---- text ----
    public String getCharacter() {
        return this.src.getCharacter();
    }
    public String toString() {
        return this.src.toString();
    }
    // ---- position and direction ----
    public Matrix getTextPos() {
        return this.src.getTextPos();
    }
    public float getDir() {
        return this.src.getDir();
    }
    public float getX() {
        return this.src.getX();
    }
    public float getXDirAdj() {
        return this.src.getXDirAdj();
    }
    public float getY() {
        return this.src.getY();
    }
    public float getYDirAdj() {
        return this.src.getYDirAdj();
    }
    // ---- size ----
    public float getWidth() {
        return this.src.getWidth();
    }
    public float getWidthDirAdj() {
        return this.src.getWidthDirAdj();
    }
    public float getHeight() {
        return this.src.getHeight();
    }
    public float getHeightDir() {
        return this.src.getHeightDir();
    }
    // ---- font and spacing ----
    public float getFontSize() {
        return this.src.getFontSize();
    }
    public float getFontSizeInPt() {
        return this.src.getFontSizeInPt();
    }
    public PDFont getFont() {
        return this.src.getFont();
    }
    public float getWordSpacing() {
        return this.src.getWordSpacing();
    }
    public float getWidthOfSpace() {
        return this.src.getWidthOfSpace();
    }
    public float getXScale() {
        return this.src.getXScale();
    }
    public float getYScale() {
        return this.src.getYScale();
    }
    public float[] getIndividualWidths() {
        return this.src.getIndividualWidths();
    }
    // ---- relations to other positions ----
    public boolean contains(TextPosition tp2) {
        return this.src.contains(tp2);
    }
    public void mergeDiacritic(TextPosition diacritic,
            TextNormalize normalize) {
        this.src.mergeDiacritic(diacritic, normalize);
    }
    public boolean isDiacritic() {
        return this.src.isDiacritic();
    }
}
}