package com.tom_roush.pdfbox.text;
import android.util.Log;
import com.tom_roush.pdfbox.pdmodel.font.PDFont;
import com.tom_roush.pdfbox.util.Matrix;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.Map;
/**
 * This represents a string and a position on the screen of those characters.
 *
 * <p>All positional state is fixed at construction time. The only state that changes afterwards
 * is the Unicode string and the per-character width array, and only through
 * {@link #mergeDiacritic(TextPosition)}.
 *
 * @author Ben Litchfield
 */
public final class TextPosition
{
    private static final Map<Integer, String> DIACRITICS = createDiacritics();

    /**
     * Builds the lookup from non-decomposing diacritics to their related combining character.
     *
     * <p>These are values that the Unicode spec claims are equivalent but are not mapped in the
     * form NFKC normalization method. Determined by going through the Combining Diacritical Marks
     * section of the Unicode spec and identifying which characters are not mapped to by the
     * normalization.
     *
     * @return map from the code point of a spacing diacritic to its combining counterpart
     */
    private static Map<Integer, String> createDiacritics()
    {
        Map<Integer, String> map = new HashMap<Integer, String>();
        map.put(0x0060, "\u0300");
        map.put(0x02CB, "\u0300");
        map.put(0x0027, "\u0301");
        map.put(0x02B9, "\u0301");
        map.put(0x02CA, "\u0301");
        map.put(0x005e, "\u0302");
        map.put(0x02C6, "\u0302");
        map.put(0x007E, "\u0303");
        map.put(0x02C9, "\u0304");
        map.put(0x00B0, "\u030A");
        map.put(0x02BA, "\u030B");
        map.put(0x02C7, "\u030C");
        map.put(0x02C8, "\u030D");
        map.put(0x0022, "\u030E");
        map.put(0x02BB, "\u0312");
        map.put(0x02BC, "\u0313");
        map.put(0x0486, "\u0313");
        map.put(0x055A, "\u0313");
        map.put(0x02BD, "\u0314");
        map.put(0x0485, "\u0314");
        map.put(0x0559, "\u0314");
        map.put(0x02D4, "\u031D");
        map.put(0x02D5, "\u031E");
        map.put(0x02D6, "\u031F");
        map.put(0x02D7, "\u0320");
        map.put(0x02B2, "\u0321");
        map.put(0x02CC, "\u0329");
        map.put(0x02B7, "\u032B");
        map.put(0x02CD, "\u0331");
        map.put(0x005F, "\u0332");
        map.put(0x204E, "\u0359");
        return map;
    }

    // text matrix for the start of the text object, coordinates are in display units
    // and have not been adjusted
    private final Matrix textMatrix;

    // ending X and Y coordinates in display units
    private final float endX;
    private final float endY;

    private final float maxHeight; // maximum height of text, in display units
    private final int rotation; // 0, 90, 180, 270 degrees of page rotation
    private final float x; // page-rotation adjusted x, computed once in the constructor
    private final float y; // page-rotation adjusted y (0,0 upper left), computed once
    private final float pageHeight;
    private final float pageWidth;

    private final float widthOfSpace; // width of a space, in display units
    private final int[] charCodes; // internal PDF character codes
    private final PDFont font;
    private final float fontSize;
    private final int fontSizePt;

    // mutable: both are replaced when a diacritic is merged in
    private float[] widths;
    private String unicode;

    /**
     * Constructor.
     *
     * @param pageRotation rotation of the page that the text is located in
     * @param pageWidth width of the page that the text is located in
     * @param pageHeight height of the page that the text is located in
     * @param textMatrix TextMatrix for start of text (in display units)
     * @param endX x coordinate of the end position
     * @param endY y coordinate of the end position
     * @param maxHeight Maximum height of text (in display units)
     * @param individualWidth The width of the given character/string. (in text units)
     * @param spaceWidth The width of the space character. (in display units)
     * @param unicode The string of Unicode characters to be displayed.
     * @param charCodes An array of the internal PDF character codes for the glyphs in this text.
     * @param font The current font for this text position.
     * @param fontSize The new font size.
     * @param fontSizeInPt The font size in pt units.
     */
    public TextPosition(int pageRotation, float pageWidth, float pageHeight, Matrix textMatrix,
                        float endX, float endY, float maxHeight, float individualWidth,
                        float spaceWidth, String unicode, int[] charCodes, PDFont font,
                        float fontSize, int fontSizeInPt)
    {
        this.textMatrix = textMatrix;

        this.endX = endX;
        this.endY = endY;

        this.rotation = pageRotation;

        this.maxHeight = maxHeight;
        this.pageHeight = pageHeight;
        this.pageWidth = pageWidth;

        this.widths = new float[] { individualWidth };
        this.widthOfSpace = spaceWidth;
        this.unicode = unicode;
        this.charCodes = charCodes;
        this.font = font;
        this.fontSize = fontSize;
        this.fontSizePt = fontSizeInPt;

        // The helpers below read textMatrix, pageWidth and pageHeight, so they must only be
        // called once those fields have been assigned.
        x = getXRot(pageRotation);
        if (pageRotation == 0 || pageRotation == 180)
        {
            y = this.pageHeight - getYLowerLeftRot(pageRotation);
        }
        else
        {
            y = this.pageWidth - getYLowerLeftRot(pageRotation);
        }
    }

    /**
     * Return the string of characters stored in this object.
     *
     * @return The string on the screen.
     */
    public String getUnicode()
    {
        return unicode;
    }

    /**
     * Return the internal PDF character codes of the glyphs in this text.
     *
     * @return an array of internal PDF character codes (the stored array itself, not a copy)
     */
    public int[] getCharacterCodes()
    {
        return charCodes;
    }

    /**
     * Return the text matrix stored in this object.
     *
     * @return The Matrix containing the starting text position
     */
    public Matrix getTextMatrix()
    {
        return textMatrix;
    }

    /**
     * Return the direction/orientation of the string in this object based on its text matrix.
     *
     * <p>Each direction is recognised by one pair of matrix entries dominating the other pair.
     * When no pattern matches, 0 is returned.
     *
     * @return The direction of the text (0, 90, 180, or 270)
     */
    public float getDir()
    {
        // a and d are the two scale entries, b and c the two shear entries; every test below
        // compares a scale entry against a shear entry, so the pairing matters.
        float a = textMatrix.getScaleY();
        float b = textMatrix.getShearY();
        float c = textMatrix.getShearX();
        float d = textMatrix.getScaleX();

        // 12 0   left to right
        // 0 12
        if (a > 0 && Math.abs(b) < d && Math.abs(c) < a && d > 0)
        {
            return 0;
        }
        // -12 0   right to left (upside down)
        // 0 -12
        else if (a < 0 && Math.abs(b) < Math.abs(d) && Math.abs(c) < Math.abs(a) && d < 0)
        {
            return 180;
        }
        // 0  12    up
        // -12 0
        else if (Math.abs(a) < Math.abs(c) && b > 0 && c < 0 && Math.abs(d) < b)
        {
            return 90;
        }
        // 0  -12   down
        // 12 0
        else if (Math.abs(a) < c && b < 0 && c > 0 && Math.abs(d) < Math.abs(b))
        {
            return 270;
        }
        return 0;
    }

    /**
     * Return the X starting coordinate of the text, adjusted by the given rotation amount.
     * The rotation adjusts where the 0,0 location is relative to the text.
     *
     * @param rotation Rotation to apply (0, 90, 180, or 270). 0 will perform no adjustments.
     * @return X coordinate, or 0 for any other rotation value
     */
    private float getXRot(float rotation)
    {
        if (rotation == 0)
        {
            return textMatrix.getTranslateX();
        }
        else if (rotation == 90)
        {
            return textMatrix.getTranslateY();
        }
        else if (rotation == 180)
        {
            return pageWidth - textMatrix.getTranslateX();
        }
        else if (rotation == 270)
        {
            // mirror of the 90 degree case, just as 180 mirrors 0
            return pageHeight - textMatrix.getTranslateY();
        }
        return 0;
    }

    /**
     * This will get the page rotation adjusted x position of the character.
     * This is adjusted based on page rotation so that the upper left is 0,0.
     *
     * @return The x coordinate of the character.
     */
    public float getX()
    {
        return x;
    }

    /**
     * This will get the text direction adjusted x position of the character.
     * This is adjusted based on text direction so that the first character
     * in that direction is in the upper left at 0,0.
     *
     * @return The x coordinate of the text.
     */
    public float getXDirAdj()
    {
        return getXRot(getDir());
    }

    /**
     * This will get the y position of the character with 0,0 in lower left.
     * This will be adjusted by the given rotation.
     *
     * @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
     * @return The y coordinate of the text, or 0 for any other rotation value
     */
    private float getYLowerLeftRot(float rotation)
    {
        if (rotation == 0)
        {
            return textMatrix.getTranslateY();
        }
        else if (rotation == 90)
        {
            return pageWidth - textMatrix.getTranslateX();
        }
        else if (rotation == 180)
        {
            return pageHeight - textMatrix.getTranslateY();
        }
        else if (rotation == 270)
        {
            return textMatrix.getTranslateX();
        }
        return 0;
    }

    /**
     * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
     * adjusted based on the page rotation.
     *
     * @return The adjusted y coordinate of the character.
     */
    public float getY()
    {
        return y;
    }

    /**
     * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
     * adjusted based on the text direction.
     *
     * @return The adjusted y coordinate of the character.
     */
    public float getYDirAdj()
    {
        float dir = getDir();
        // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
        if (dir == 0 || dir == 180)
        {
            return pageHeight - getYLowerLeftRot(dir);
        }
        else
        {
            return pageWidth - getYLowerLeftRot(dir);
        }
    }

    /**
     * Get the length or width of the text, based on a given rotation.
     *
     * @param rotation Rotation that was used to determine coordinates (0,90,180,270)
     * @return Width of text in display units
     */
    private float getWidthRot(float rotation)
    {
        if (rotation == 90 || rotation == 270)
        {
            return Math.abs(endY - textMatrix.getTranslateY());
        }
        else
        {
            return Math.abs(endX - textMatrix.getTranslateX());
        }
    }

    /**
     * This will get the width of the string when page rotation adjusted coordinates are used.
     *
     * @return The width of the text in display units.
     */
    public float getWidth()
    {
        return getWidthRot(rotation);
    }

    /**
     * This will get the width of the string when text direction adjusted coordinates are used.
     *
     * @return The width of the text in display units.
     */
    public float getWidthDirAdj()
    {
        return getWidthRot(getDir());
    }

    /**
     * This will get the maximum height of all characters in this string.
     *
     * @return The maximum height of all characters in this string.
     */
    public float getHeight()
    {
        return maxHeight;
    }

    /**
     * This will get the maximum height of all characters in this string.
     *
     * @return The maximum height of all characters in this string.
     */
    public float getHeightDir()
    {
        // this is not really a rotation-dependent calculation, but this is defined for symmetry
        return maxHeight;
    }

    /**
     * This will get the font size that this object is supposed to be drawn at.
     *
     * @return The font size.
     */
    public float getFontSize()
    {
        return fontSize;
    }

    /**
     * This will get the font size in pt. To get this size we have to multiply the pdf-fontsize
     * and the scaling from the textmatrix
     *
     * @return The font size in pt.
     */
    public float getFontSizeInPt()
    {
        return fontSizePt;
    }

    /**
     * This will get the font for the text being drawn.
     *
     * @return The font.
     */
    public PDFont getFont()
    {
        return font;
    }

    /**
     * This will get the width of a space character. This is useful for some algorithms such as the
     * text stripper, that need to know the width of a space character.
     *
     * @return The width of a space character.
     */
    public float getWidthOfSpace()
    {
        return widthOfSpace;
    }

    /**
     * @return Returns the xScale.
     */
    public float getXScale()
    {
        return textMatrix.getScalingFactorX();
    }

    /**
     * @return Returns the yScale.
     */
    public float getYScale()
    {
        return textMatrix.getScalingFactorY();
    }

    /**
     * Get the widths of each individual character.
     *
     * @return An array that is the same length as the length of the string (the stored array
     * itself, not a copy).
     */
    public float[] getIndividualWidths()
    {
        return widths;
    }

    /**
     * Determine if this TextPosition logically contains another (i.e. they overlap and should be
     * rendered on top of each other).
     *
     * @param tp2 The other TextPosition to compare against
     * @return True if tp2 is contained in the bounding box of this text.
     */
    public boolean contains(TextPosition tp2)
    {
        // Sums are formed in float and only then widened, so the comparisons see the same
        // rounded values regardless of how often the getters are called.
        float thisX = getXDirAdj();
        float thisWidth = getWidthDirAdj();
        float otherX = tp2.getXDirAdj();

        double thisXstart = thisX;
        double thisXend = thisX + thisWidth;
        double tp2Xstart = otherX;
        double tp2Xend = otherX + tp2.getWidthDirAdj();

        // no X overlap at all so return as soon as possible
        if (tp2Xend <= thisXstart || tp2Xstart >= thisXend)
        {
            return false;
        }

        // no Y overlap at all so return as soon as possible. Note: 0.0 is in the upper left and
        // y-coordinate is top of TextPosition
        float thisY = getYDirAdj();
        float otherY = tp2.getYDirAdj();
        if (otherY + tp2.getHeightDir() < thisY || otherY > thisY + getHeightDir())
        {
            return false;
        }

        // we're going to calculate the percentage of overlap, if its less than a 15% x-coordinate
        // overlap then we'll return false because its negligible, .15 was determined by trial and
        // error in the regression test files
        if (tp2Xstart > thisXstart && tp2Xend > thisXend)
        {
            // tp2 hangs over the right edge of this text
            double overlap = thisXend - tp2Xstart;
            return overlap / thisWidth > .15;
        }
        if (tp2Xstart < thisXstart && tp2Xend < thisXend)
        {
            // tp2 hangs over the left edge of this text
            double overlap = tp2Xend - thisXstart;
            return overlap / thisWidth > .15;
        }
        return true;
    }

    /**
     * Merge a single character TextPosition into the current object. This is to be used only for
     * cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
     * display, we could overlay them, but for text extraction we need to merge them. Use the
     * contains() method to test if two objects overlap.
     *
     * <p>A diacritic whose Unicode string is not exactly one char long is ignored.
     *
     * @param diacritic TextPosition to merge into the current TextPosition.
     */
    public void mergeDiacritic(TextPosition diacritic)
    {
        // An empty string would fail later in combineDiacritic (codePointAt(0)), and a longer
        // one is not a single diacritic, so both are skipped.
        if (diacritic.getUnicode().length() != 1)
        {
            return;
        }

        float diacXStart = diacritic.getXDirAdj();
        float diacXEnd = diacXStart + diacritic.widths[0];

        float currCharXStart = getXDirAdj();

        int strLen = unicode.length();
        for (int i = 0; i < strLen; i++)
        {
            if (i >= widths.length)
            {
                Log.i("PdfBox-Android", "diacritic " + diacritic.getUnicode() + " on ligature " +
                    unicode + " is not supported yet and is ignored (PDFBOX-2831)");
                break;
            }
            float currCharXEnd = currCharXStart + widths[i];

            // this is the case where there is an overlap of the diacritic character with the
            // current character and the previous character. If no previous character, just append
            // the diacritic after the current one
            if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
            {
                if (i == 0)
                {
                    insertDiacritic(i, diacritic);
                }
                else
                {
                    // attach to whichever neighbour is covered by the larger share of its width
                    float distanceOverlapping1 = diacXEnd - currCharXStart;
                    float percentage1 = distanceOverlapping1 / widths[i];

                    float distanceOverlapping2 = currCharXStart - diacXStart;
                    float percentage2 = distanceOverlapping2 / widths[i - 1];

                    if (percentage1 >= percentage2)
                    {
                        insertDiacritic(i, diacritic);
                    }
                    else
                    {
                        insertDiacritic(i - 1, diacritic);
                    }
                }
                return;
            }
            // diacritic completely covers this character and therefore we assume that this is the
            // character the diacritic belongs to
            else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
            {
                insertDiacritic(i, diacritic);
                return;
            }
            // otherwise, The diacritic modifies this character because its completely
            // contained by the character width
            else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
            {
                insertDiacritic(i, diacritic);
                return;
            }
            // last character in the TextPosition so we add diacritic to the end
            else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == strLen - 1)
            {
                insertDiacritic(i, diacritic);
                return;
            }

            // couldn't find anything useful so we go to the next character in the TextPosition
            currCharXStart += widths[i];
        }
    }

    /**
     * Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
     * array to include the extra character width.
     *
     * <p>The widths array always grows by exactly one zero-width entry, whatever the length of
     * the combining string that is inserted.
     *
     * @param i index of the base character the diacritic is attached to
     * @param diacritic The diacritic TextPosition
     */
    private void insertDiacritic(int i, TextPosition diacritic)
    {
        // Unicode combining diacritics always go after the base character, regardless of whether
        // the string is in presentation order or logical order
        String combining = combineDiacritic(diacritic.getUnicode());
        String merged = new StringBuilder(unicode).insert(i + 1, combining).toString();

        // widths[0..i] keep their place, slot i + 1 stays at the default 0 for the diacritic,
        // and the remaining widths shift one position to the right
        float[] widths2 = new float[widths.length + 1];
        System.arraycopy(widths, 0, widths2, 0, i + 1);
        System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

        // assign only after everything above succeeded, so a failure leaves the state untouched
        unicode = merged;
        widths = widths2;
    }

    /**
     * Combine the diacritic, for example, convert non-combining diacritic characters to their
     * combining counterparts.
     *
     * @param str String to normalize
     * @return Normalized string
     */
    private String combineDiacritic(String str)
    {
        // Unicode contains special combining forms of the diacritic characters which we want to
        // use; the table covers the characters NFKC normalization does not map
        String combining = DIACRITICS.get(str.codePointAt(0));
        if (combining != null)
        {
            return combining;
        }
        return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
    }

    /**
     * @return True if the current character is a diacritic char.
     */
    public boolean isDiacritic()
    {
        String text = this.getUnicode();
        if (text.length() != 1)
        {
            return false;
        }
        int type = Character.getType(text.charAt(0));
        return type == Character.NON_SPACING_MARK ||
               type == Character.MODIFIER_SYMBOL ||
               type == Character.MODIFIER_LETTER;
    }

    /**
     * Show the string data for this text position.
     *
     * @return A human readable form of this object.
     */
    @Override
    public String toString()
    {
        return getUnicode();
    }
}