/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.util;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.font.PDFont;
/**
* This represents a string and a position on the screen of those characters.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.12 $
*/
public class TextPosition
{
/* TextMatrix for the start of the text object. Coordinates
* are in display units and have not been adjusted. */
private Matrix textPos;
// ending X and Y coordinates in display units
private float endX;
private float endY;
private float maxTextHeight; // maximum height of text, in display units
private int rot; // 0, 90, 180, 270 degrees of page rotation
private float x = Float.NEGATIVE_INFINITY;
private float y = Float.NEGATIVE_INFINITY;
private float pageHeight;
private float pageWidth;
private float[] widths;
private float widthOfSpace; // width of a space, in display units
private String str;
private PDFont font;
private float fontSize;
private int fontSizePt;
// TODO remove unused value
private float wordSpacing; // word spacing value, in display units
/**
* Constructor.
*/
protected TextPosition()
{
}
/**
* Constructor.
*
* @param page Page that the text is located in
* @param textPositionSt TextMatrix for start of text (in display units)
* @param textPositionEnd TextMatrix for end of text (in display units)
* @param maxFontH Maximum height of text (in display units)
* @param individualWidths The width of each individual character. (in ? units)
* @param spaceWidth The width of the space character. (in display units)
* @param string The character to be displayed.
* @param currentFont The current for for this text position.
* @param fontSizeValue The new font size.
* @param fontSizeInPt The font size in pt units.
* @param ws The word spacing parameter (in display units)
*/
public TextPosition(
PDPage page,
Matrix textPositionSt,
Matrix textPositionEnd,
float maxFontH,
float[] individualWidths,
float spaceWidth,
String string,
PDFont currentFont,
float fontSizeValue,
int fontSizeInPt,
float ws
)
{
this.textPos = textPositionSt;
this.endX = textPositionEnd.getXPosition();
this.endY = textPositionEnd.getYPosition();
this.rot = page.findRotation();
// make sure it is 0 to 270 and no negative numbers
if(this.rot < 0)
{
rot += 360;
}
this.maxTextHeight = maxFontH;
this.pageHeight = page.findMediaBox().getHeight();
this.pageWidth = page.findMediaBox().getWidth();
this.widths = individualWidths;
this.widthOfSpace = spaceWidth;
this.str = string;
this.font = currentFont;
this.fontSize = fontSizeValue;
this.fontSizePt = fontSizeInPt;
this.wordSpacing = ws;
}
/**
* Constructor.
*
* @param pageRotation rotation of the page that the text is located in
* @param pageWidthValue rotation of the page that the text is located in
* @param pageHeightValue rotation of the page that the text is located in
* @param textPositionSt TextMatrix for start of text (in display units)
* @param textPositionEnd TextMatrix for end of text (in display units)
* @param maxFontH Maximum height of text (in display units)
* @param individualWidth The width of the given character/string. (in ? units)
* @param spaceWidth The width of the space character. (in display units)
* @param string The character to be displayed.
* @param currentFont The current for for this text position.
* @param fontSizeValue The new font size.
* @param fontSizeInPt The font size in pt units.
*
* @deprecated Use {@link TextPosition(int, float, float, Matrix, float, float, float, float, float,
* String, PDFont, float, int)} instead.
*/
public TextPosition(
int pageRotation,
float pageWidthValue,
float pageHeightValue,
Matrix textPositionSt,
Matrix textPositionEnd,
float maxFontH,
float individualWidth,
float spaceWidth,
String string,
PDFont currentFont,
float fontSizeValue,
int fontSizeInPt
)
{
this(pageRotation, pageWidthValue, pageHeightValue, textPositionSt,
textPositionEnd.getXPosition(), textPositionEnd.getYPosition(),
maxFontH, individualWidth, spaceWidth, string, currentFont, fontSizeValue, fontSizeInPt);
}
/**
* Constructor.
*
* @param pageRotation rotation of the page that the text is located in
* @param pageWidthValue rotation of the page that the text is located in
* @param pageHeightValue rotation of the page that the text is located in
* @param textPositionSt TextMatrix for start of text (in display units)
* @param endXValue x coordinate of the end position
* @param endYValue y coordinate of the end position
* @param maxFontH Maximum height of text (in display units)
* @param individualWidth The width of the given character/string. (in ? units)
* @param spaceWidth The width of the space character. (in display units)
* @param string The character to be displayed.
* @param currentFont The current for for this text position.
* @param fontSizeValue The new font size.
* @param fontSizeInPt The font size in pt units.
*/
public TextPosition(
int pageRotation,
float pageWidthValue,
float pageHeightValue,
Matrix textPositionSt,
float endXValue,
float endYValue,
float maxFontH,
float individualWidth,
float spaceWidth,
String string,
PDFont currentFont,
float fontSizeValue,
int fontSizeInPt
)
{
this.textPos = textPositionSt;
this.endX = endXValue;
this.endY = endYValue;
this.rot = pageRotation;
// make sure it is 0 to 270 and no negative numbers
if(this.rot < 0)
{
rot += 360;
}
this.maxTextHeight = maxFontH;
this.pageHeight = pageHeightValue;
this.pageWidth = pageWidthValue;
this.widths = new float[]{individualWidth};
this.widthOfSpace = spaceWidth;
this.str = string;
this.font = currentFont;
this.fontSize = fontSizeValue;
this.fontSizePt = fontSizeInPt;
}
/**
* Return the string of characters stored in this object.
*
* @return The string on the screen.
*/
public String getCharacter()
{
return str;
}
/**
* Return the Matrix textPos stored in this object.
*
* @return The Matrix containing all infos of the starting textposition
*/
public Matrix getTextPos()
{
return textPos;
}
/**
* Return the direction/orientation of the string in this object
* based on its text matrix.
* @return The direction of the text (0, 90, 180, or 270)
*/
public float getDir()
{
float a = textPos.getValue(0,0);
float b = textPos.getValue(0,1);
float c = textPos.getValue(1,0);
float d = textPos.getValue(1,1);
// 12 0 left to right
// 0 12
if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
{
return 0;
}
// -12 0 right to left (upside down)
// 0 -12
else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
{
return 180;
}
// 0 12 up
// -12 0
else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
{
return 90;
}
// 0 -12 down
// 12 0
else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
{
return 270;
}
return 0;
}
/**
* Return the X starting coordinate of the text, adjusted by
* the given rotation amount. The rotation adjusts where the 0,0
* location is relative to the text.
*
* @param rotation Rotation to apply (0, 90, 180, or 270). 0 will perform no adjustments.
* @return X coordinate
*/
private float getXRot(float rotation)
{
if (rotation == 0)
{
return textPos.getValue(2,0);
}
else if (rotation == 90)
{
return textPos.getValue(2,1);
}
else if (rotation == 180)
{
return pageWidth - textPos.getValue(2,0);
}
else if (rotation == 270)
{
return pageHeight - textPos.getValue(2,1);
}
return 0;
}
/**
* This will get the page rotation adjusted x position of the character.
* This is adjusted based on page rotation so that the upper
* left is 0,0.
*
* @return The x coordinate of the character.
*/
public float getX()
{
if (x == Float.NEGATIVE_INFINITY)
{
x = getXRot(rot);
}
return x;
}
/**
* This will get the text direction adjusted x position of the character.
* This is adjusted based on text direction so that the first character
* in that direction is in the upper left at 0,0.
*
* @return The x coordinate of the text.
*/
public float getXDirAdj()
{
return getXRot(getDir());
}
/**
* This will get the y position of the character with 0,0 in lower left.
* This will be adjusted by the given rotation.
* @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
*
* @return The y coordinate of the text
*/
private float getYLowerLeftRot(float rotation)
{
if (rotation == 0)
{
return textPos.getValue(2,1);
}
else if (rotation == 90)
{
return pageWidth - textPos.getValue(2,0);
}
else if (rotation == 180)
{
return pageHeight - textPos.getValue(2,1);
}
else if (rotation == 270)
{
return textPos.getValue(2,0);
}
return 0;
}
/**
* This will get the y position of the text, adjusted so that 0,0 is upper left and
* it is adjusted based on the page rotation.
*
* @return The adjusted y coordinate of the character.
*/
public float getY()
{
if (y == Float.NEGATIVE_INFINITY)
{
if ((rot == 0) || (rot == 180))
{
y = pageHeight - getYLowerLeftRot(rot);
}
else
{
y = pageWidth - getYLowerLeftRot(rot);
}
}
return y;
}
/**
* This will get the y position of the text, adjusted so that 0,0 is upper left and
* it is adjusted based on the text direction.
*
* @return The adjusted y coordinate of the character.
*/
public float getYDirAdj()
{
float dir = getDir();
// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
if ((dir == 0) || (dir == 180))
{
return pageHeight - getYLowerLeftRot(dir);
}
else
{
return pageWidth - getYLowerLeftRot(dir);
}
}
/**
* Get the length or width of the text, based on a given rotation.
*
* @param rotation Rotation that was used to determine coordinates (0,90,180,270)
* @return Width of text in display units
*/
private float getWidthRot(float rotation)
{
if ((rotation == 90) || (rotation == 270))
{
return Math.abs(endY - textPos.getYPosition());
}
else
{
return Math.abs(endX - textPos.getXPosition());
}
}
/**
* This will get the width of the string when page rotation adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidth()
{
return getWidthRot(rot);
}
/**
* This will get the width of the string when text direction adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidthDirAdj()
{
return getWidthRot(getDir());
}
/**
* This will get the maximum height of all characters in this string.
*
* @return The maximum height of all characters in this string.
*/
public float getHeight()
{
return maxTextHeight;
}
/**
* This will get the maximum height of all characters in this string.
*
* @return The maximum height of all characters in this string.
*/
public float getHeightDir()
{
// this is not really a rotation-dependent calculation, but this is defined for symmetry.
return maxTextHeight;
}
/**
* This will get the font size that this object is
* suppose to be drawn at.
*
* @return The font size.
*/
public float getFontSize()
{
return fontSize;
}
/**
* This will get the font size in pt.
* To get this size we have to multiply the pdf-fontsize and the scaling from the textmatrix
*
* @return The font size in pt.
*/
public float getFontSizeInPt()
{
return fontSizePt;
}
/**
* This will get the font for the text being drawn.
*
* @return The font size.
*/
public PDFont getFont()
{
return font;
}
/**
* This will get the current word spacing.
*
* @return The current word spacing.
*/
@Deprecated
public float getWordSpacing()
{
return wordSpacing;
}
/**
* This will get the width of a space character. This is useful for some
* algorithms such as the text stripper, that need to know the width of a
* space character.
*
* @return The width of a space character.
*/
public float getWidthOfSpace()
{
return widthOfSpace;
}
/**
* @return Returns the xScale.
*/
public float getXScale()
{
return textPos.getXScale();
}
/**
* @return Returns the yScale.
*/
public float getYScale()
{
return textPos.getYScale();
}
/**
* Get the widths of each individual character.
*
* @return An array that is the same length as the length of the string.
*/
public float[] getIndividualWidths()
{
return widths;
}
/**
* Show the string data for this text position.
*
* @return A human readable form of this object.
*/
public String toString()
{
return getCharacter();
}
/**
* Determine if this TextPosition logically contains
* another (i.e. they overlap and should be rendered on top
* of each other).
* @param tp2 The other TestPosition to compare against
*
* @return True if tp2 is contained in the bounding box of this text.
*/
public boolean contains( TextPosition tp2)
{
double thisXstart = getXDirAdj();
double thisXend = getXDirAdj() + getWidthDirAdj();
double tp2Xstart = tp2.getXDirAdj();
double tp2Xend = tp2.getXDirAdj() + tp2.getWidthDirAdj();
/*
* No X overlap at all so return as soon as possible.
*/
if(tp2Xend <= thisXstart || tp2Xstart >= thisXend)
{
return false;
}
/*
* No Y overlap at all so return as soon as possible.
* Note: 0.0 is in the upper left and y-coordinate is
* top of TextPosition
*/
if((tp2.getYDirAdj() + tp2.getHeightDir() < getYDirAdj()) ||
(tp2.getYDirAdj() > getYDirAdj() + getHeightDir()))
{
return false;
}
/* We're going to calculate the percentage of overlap. If its less
* than a 15% x-coordinate overlap then we'll return false because its negligible.
* .15 was determined by trial and error in the regression test files.
*/
else if((tp2Xstart > thisXstart) && (tp2Xend > thisXend))
{
double overlap = thisXend - tp2Xstart;
double overlapPercent = overlap/getWidthDirAdj();
return (overlapPercent > .15);
}
else if((tp2Xstart < thisXstart) && (tp2Xend < thisXend))
{
double overlap = tp2Xend - thisXstart;
double overlapPercent = overlap/getWidthDirAdj();
return (overlapPercent > .15);
}
return true;
}
/**
* Merge a single character TextPosition into the current object.
* This is to be used only for cases where we have a diacritic that
* overlaps an existing TextPosition. In a graphical display, we could
* overlay them, but for text extraction we need to merge them. Use the
* contains() method to test if two objects overlap.
*
* @param diacritic TextPosition to merge into the current TextPosition.
* @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
{
if (diacritic.getCharacter().length() > 1)
{
return;
}
float diacXStart = diacritic.getXDirAdj();
float diacXEnd = diacXStart + diacritic.widths[0];
float currCharXStart = getXDirAdj();
int strLen = str.length();
boolean wasAdded = false;
for (int i = 0; i < strLen && !wasAdded; i++)
{
float currCharXEnd = currCharXStart + widths[i];
/*
* This is the case where there is an overlap of the diacritic character with
* the current character and the previous character. If no previous character,
* just append the diacritic after the current one.
*/
if(diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
{
if(i == 0)
{
insertDiacritic(i, diacritic, normalize);
}
else
{
float distanceOverlapping1 = diacXEnd - currCharXStart;
float percentage1 = distanceOverlapping1/widths[i];
float distanceOverlapping2 = currCharXStart - diacXStart;
float percentage2 = distanceOverlapping2/widths[i-1];
if(percentage1 >= percentage2)
{
insertDiacritic(i, diacritic, normalize);
}
else
{
insertDiacritic(i-1, diacritic, normalize);
}
}
wasAdded = true;
}
//diacritic completely covers this character and therefore we assume that
//this is the character the diacritic belongs to
else if(diacXStart < currCharXStart && diacXEnd > currCharXEnd)
{
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
//Otherwise, The diacritic modifies this character because its completely
//contained by the character width
else if(diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
{
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
/*
* Last character in the TextPosition so we add diacritic to the end
*/
else if(diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1))
{
insertDiacritic(i, diacritic, normalize);
wasAdded = true;
}
/*
* Couldn't find anything useful so we go to the next character in the
* TextPosition
*/
currCharXStart += widths[i];
}
}
/**
* Inserts the diacritic TextPosition to the str of this TextPosition
* and updates the widths array to include the extra character width.
* @param i current character
* @param diacritic The diacritic TextPosition
* @param normalize Instance of TextNormalize class to be used to normalize diacritic
*/
private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
{
/* we add the diacritic to the right or left of the character
* depending on the direction of the character. Note that this
* is only required because the text is currently stored in
* presentation order and not in logical order.
*/
int dir = Character.getDirectionality(str.charAt(i));
StringBuffer buf = new StringBuffer();
buf.append(str.substring(0,i));
float[] widths2 = new float[widths.length+1];
System.arraycopy(widths, 0, widths2, 0, i);
if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
|| (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE))
{
buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
widths2[i] = 0;
buf.append(str.charAt(i));
widths2[i+1] = widths[i];
}
else
{
buf.append(str.charAt(i));
widths2[i] = widths[i];
buf.append(normalize.normalizeDiac(diacritic.getCharacter()));
widths2[i+1] = 0;
}
// Get the rest of the string
buf.append(str.substring(i+1, str.length()));
System.arraycopy(widths, i+1, widths2, i+2, widths.length-i-1);
str = buf.toString();
widths = widths2;
}
/**
*
* @return True if the current character is a diacritic char.
*/
public boolean isDiacritic()
{
final String cText = this.getCharacter();
if (cText.length() != 1)
{
return false;
}
final int type = Character.getType(cText.charAt(0));
return (type == Character.NON_SPACING_MARK
|| type == Character.MODIFIER_SYMBOL
|| type == Character.MODIFIER_LETTER);
}
}