TextPosition.java example

Explorer
PdfBox-Android-master
- library
  - src
- sample
  - src
    - main
      - java
        com
        tom_roush
        pdfbox
        sample
        MainActivity.java
package com.tom_roush.pdfbox.text;

import android.util.Log;

import com.tom_roush.pdfbox.pdmodel.font.PDFont;
import com.tom_roush.pdfbox.util.Matrix;

import java.text.Normalizer;
import java.util.HashMap;
import java.util.Map;

/**
 * This represents a string and a position on the screen of those characters.
 *
 * @author Ben Litchfield
 */
public final class TextPosition
{
	private static final Map<Integer, String> DIACRITICS = createDiacritics();
	// Adds non-decomposing diacritics to the hash with their related combining character.
	// These are values that the unicode spec claims are equivalent but are not mapped in the form
	// NFKC normalization method. Determined by going through the Combining Diacritical Marks
	// section of the Unicode spec and identifying which characters are not mapped to by the
	// normalization.
	private static Map<Integer, String> createDiacritics()
	{
		HashMap<Integer, String> map = new HashMap<Integer, String>();
		map.put(0x0060, "\u0300");
		map.put(0x02CB, "\u0300");
		map.put(0x0027, "\u0301");
		map.put(0x02B9, "\u0301");
		map.put(0x02CA, "\u0301");
		map.put(0x005e, "\u0302");
		map.put(0x02C6, "\u0302");
		map.put(0x007E, "\u0303");
		map.put(0x02C9, "\u0304");
		map.put(0x00B0, "\u030A");
		map.put(0x02BA, "\u030B");
		map.put(0x02C7, "\u030C");
		map.put(0x02C8, "\u030D");
		map.put(0x0022, "\u030E");
		map.put(0x02BB, "\u0312");
		map.put(0x02BC, "\u0313");
		map.put(0x0486, "\u0313");
		map.put(0x055A, "\u0313");
		map.put(0x02BD, "\u0314");
		map.put(0x0485, "\u0314");
		map.put(0x0559, "\u0314");
		map.put(0x02D4, "\u031D");
		map.put(0x02D5, "\u031E");
		map.put(0x02D6, "\u031F");
		map.put(0x02D7, "\u0320");
		map.put(0x02B2, "\u0321");
		map.put(0x02CC, "\u0329");
		map.put(0x02B7, "\u032B");
		map.put(0x02CD, "\u0331");
		map.put(0x005F, "\u0332");
		map.put(0x204E, "\u0359");
		return map;
	}

	// text matrix for the start of the text object, coordinates are in display units
	// and have not been adjusted
	private final Matrix textMatrix;

	// ending X and Y coordinates in display units
	private final float endX;
	private final float endY;

	private final float maxHeight; // maximum height of text, in display units
	private final int rotation; // 0, 90, 180, 270 degrees of page rotation
	private final float x;
	private final float y;
	private final float pageHeight;
	private final float pageWidth;

	private final float widthOfSpace; // width of a space, in display units

	private final int[] charCodes; // internal PDF character codes
	private final PDFont font;
	private final float fontSize;
	private final int fontSizePt;

	// mutable
	private float[] widths;
	private String unicode;

	/**
	 * Constructor.
	 *
	 * @param pageRotation rotation of the page that the text is located in
	 * @param pageWidth rotation of the page that the text is located in
	 * @param pageHeight rotation of the page that the text is located in
	 * @param textMatrix TextMatrix for start of text (in display units)
	 * @param endX x coordinate of the end position
	 * @param endY y coordinate of the end position
	 * @param maxHeight Maximum height of text (in display units)
	 * @param individualWidth The width of the given character/string. (in text units)
	 * @param spaceWidth The width of the space character. (in display units)
	 * @param unicode The string of Unicode characters to be displayed.
	 * @param charCodes An array of the internal PDF character codes for the glyphs in this text.
	 * @param font The current font for this text position.
	 * @param fontSize The new font size.
	 * @param fontSizeInPt The font size in pt units.
	 */
	public TextPosition(int pageRotation, float pageWidth, float pageHeight, Matrix textMatrix,
			float endX, float endY, float maxHeight, float individualWidth,
			float spaceWidth, String unicode, int[] charCodes, PDFont font,
			float fontSize, int fontSizeInPt)
	{
		this.textMatrix = textMatrix;

		this.endX = endX;
		this.endY = endY;

		int rotationAngle = pageRotation;
		this.rotation = rotationAngle;

		this.maxHeight = maxHeight;
		this.pageHeight = pageHeight;
		this.pageWidth = pageWidth;

		this.widths = new float[] { individualWidth };
		this.widthOfSpace = spaceWidth;
		this.unicode = unicode;
		this.charCodes = charCodes;
		this.font = font;
		this.fontSize = fontSize;
		this.fontSizePt = fontSizeInPt;

		x = getXRot(rotationAngle);
		if (rotationAngle == 0 || rotationAngle == 180)
		{
			y = this.pageHeight - getYLowerLeftRot(rotationAngle);
		}
		else
		{
			y = this.pageWidth - getYLowerLeftRot(rotationAngle);
		}
	}

	/**
	 * Return the string of characters stored in this object.
	 *
	 * @return The string on the screen.
	 */
	public String getUnicode()
	{
		return unicode;
	}

	/**
	 * Return the internal PDF character codes of the glyphs in this text.
	 *
	 * @return an array of internal PDF character codes
	 */
	public int[] getCharacterCodes()
	{
		return charCodes;
	}

	/**
	 * Return the text matrix stored in this object.
	 *
	 * @return The Matrix containing the starting text position
	 */
	public Matrix getTextMatrix()
	{
		return textMatrix;
	}

	/**
	 * Return the direction/orientation of the string in this object based on its text matrix.
	 * @return The direction of the text (0, 90, 180, or 270)
	 */
	public float getDir()
	{
		float a = textMatrix.getScaleY();
		float b = textMatrix.getShearY();
		float c = textMatrix.getScaleX();
		float d = textMatrix.getShearX();

		// 12 0   left to right
		// 0 12
		if (a > 0 && Math.abs(b) < d && Math.abs(c) < a && d > 0)
		{
			return 0;
		}
		// -12 0   right to left (upside down)
		// 0 -12
		else if (a < 0 && Math.abs(b) < Math.abs(d) && Math.abs(c) < Math.abs(a) && d < 0)
		{
			return 180;
		}
		// 0  12    up
		// -12 0
		else if (Math.abs(a) < Math.abs(c) && b > 0 && c < 0 && Math.abs(d) < b)
		{
			return 90;
		}
		// 0  -12   down
		// 12 0
		else if (Math.abs(a) < c && b < 0 && c > 0 && Math.abs(d) < Math.abs(b))
		{
			return 270;
		}
		return 0;
	}

	/**
	 * Return the X starting coordinate of the text, adjusted by the given rotation amount.
	 * The rotation adjusts where the 0,0 location is relative to the text.
	 *
	 * @param rotation Rotation to apply (0, 90, 180, or 270).  0 will perform no adjustments.
	 * @return X coordinate
	 */
	private float getXRot(float rotation)
	{
		if (rotation == 0)
		{
			return textMatrix.getTranslateX();
		}
		else if (rotation == 90)
		{
			return textMatrix.getTranslateY();
		}
		else if (rotation == 180)
		{
			return pageWidth - textMatrix.getTranslateX();
		}
		else if (rotation == 270)
		{
			return pageHeight - textMatrix.getTranslateX();
		}
		return 0;
	}

	/**
	 * This will get the page rotation adjusted x position of the character.
	 * This is adjusted based on page rotation so that the upper left is 0,0.
	 *
	 * @return The x coordinate of the character.
	 */
	public float getX()
	{
		return x;
	}

	/**
	 * This will get the text direction adjusted x position of the character.
	 * This is adjusted based on text direction so that the first character
	 * in that direction is in the upper left at 0,0.
	 *
	 * @return The x coordinate of the text.
	 */
	public float getXDirAdj()
	{
		return getXRot(getDir());
	}

	/**
	 * This will get the y position of the character with 0,0 in lower left.
	 * This will be adjusted by the given rotation.
	 *
	 * @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
	 * @return The y coordinate of the text
	 */
	private float getYLowerLeftRot(float rotation)
	{
		if (rotation == 0)
		{
			return textMatrix.getTranslateY();
		}
		else if (rotation == 90)
		{
			return pageWidth - textMatrix.getTranslateX();
		}
		else if (rotation == 180)
		{
			return pageHeight - textMatrix.getTranslateY();
		}
		else if (rotation == 270)
		{
			return textMatrix.getTranslateX();
		}
		return 0;
	}

	/**
	 * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
	 * adjusted based on the page rotation.
	 *
	 * @return The adjusted y coordinate of the character.
	 */
	public float getY()
	{
		return y;
	}

	/**
	 * This will get the y position of the text, adjusted so that 0,0 is upper left and it is
	 * adjusted based on the text direction.
	 *
	 * @return The adjusted y coordinate of the character.
	 */
	public float getYDirAdj()
	{
		float dir = getDir();
		// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
		if (dir == 0 || dir == 180)
		{
			return pageHeight - getYLowerLeftRot(dir);
		}
		else
		{
			return pageWidth - getYLowerLeftRot(dir);
		}
	}

	/**
	 * Get the length or width of the text, based on a given rotation.
	 *
	 * @param rotation Rotation that was used to determine coordinates (0,90,180,270)
	 * @return Width of text in display units
	 */
	private float getWidthRot(float rotation)
	{
		if (rotation == 90 || rotation == 270)
		{
			return Math.abs(endY - textMatrix.getTranslateY());
		}
		else
		{
			return Math.abs(endX - textMatrix.getTranslateX());
		}
	}

	/**
	 * This will get the width of the string when page rotation adjusted coordinates are used.
	 *
	 * @return The width of the text in display units.
	 */
	public float getWidth()
	{
		return getWidthRot(rotation);
	}

	/**
	 * This will get the width of the string when text direction adjusted coordinates are used.
	 *
	 * @return The width of the text in display units.
	 */
	public float getWidthDirAdj()
	{
		return getWidthRot(getDir());
	}

	/**
	 * This will get the maximum height of all characters in this string.
	 *
	 * @return The maximum height of all characters in this string.
	 */
	public float getHeight()
	{
		return maxHeight;
	}

	/**
	 * This will get the maximum height of all characters in this string.
	 *
	 * @return The maximum height of all characters in this string.
	 */
	public float getHeightDir()
	{
		// this is not really a rotation-dependent calculation, but this is defined for symmetry
		return maxHeight;
	}

	/**
	 * This will get the font size that this object is suppose to be drawn at.
	 *
	 * @return The font size.
	 */
	public float getFontSize()
	{
		return fontSize;
	}

	/**
	 * This will get the font size in pt. To get this size we have to multiply the pdf-fontsize
	 * and the scaling from the textmatrix
	 *
	 * @return The font size in pt.
	 */
	public float getFontSizeInPt()
	{
		return fontSizePt;
	}

	/**
	 * This will get the font for the text being drawn.
	 *
	 * @return The font size.
	 */
	public PDFont getFont()
	{
		return font;
	}

	/**
	 * This will get the width of a space character. This is useful for some algorithms such as the
	 * text stripper, that need to know the width of a space character.
	 *
	 * @return The width of a space character.
	 */
	public float getWidthOfSpace()
	{
		return widthOfSpace;
	}

	/**
	 * @return Returns the xScale.
	 */
	public float getXScale()
	{
		return textMatrix.getScalingFactorX();
	}

	/**
	 * @return Returns the yScale.
	 */
	public float getYScale()
	{
		return textMatrix.getScalingFactorY();
	}

	/**
	 * Get the widths of each individual character.
	 *
	 * @return An array that is the same length as the length of the string.
	 */
	public float[] getIndividualWidths()
	{
		return widths;
	}

	/**
	 * Determine if this TextPosition logically contains another (i.e. they overlap and should be
	 * rendered on top of each other).
	 *
	 * @param tp2 The other TestPosition to compare against
	 * @return True if tp2 is contained in the bounding box of this text.
	 */
	public boolean contains(TextPosition tp2)
	{
		double thisXstart = getXDirAdj();
		double thisXend = getXDirAdj() + getWidthDirAdj();

		double tp2Xstart = tp2.getXDirAdj();
		double tp2Xend = tp2.getXDirAdj() + tp2.getWidthDirAdj();

		// no X overlap at all so return as soon as possible
		if (tp2Xend <= thisXstart || tp2Xstart >= thisXend)
		{
			return false;
		}

		// no Y overlap at all so return as soon as possible. Note: 0.0 is in the upper left and
		// y-coordinate is top of TextPosition
		if (tp2.getYDirAdj() + tp2.getHeightDir() < getYDirAdj() ||
				tp2.getYDirAdj() > getYDirAdj() + getHeightDir())
		{
			return false;
		}
		// we're going to calculate the percentage of overlap, if its less than a 15% x-coordinate
		// overlap then we'll return false because its negligible, .15 was determined by trial and
		// error in the regression test files
		else if (tp2Xstart > thisXstart && tp2Xend > thisXend)
		{
			double overlap = thisXend - tp2Xstart;
			double overlapPercent = overlap/getWidthDirAdj();
			return overlapPercent > .15;
		}
		else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
		{
			double overlap = tp2Xend - thisXstart;
			double overlapPercent = overlap/getWidthDirAdj();
			return overlapPercent > .15;
		}
		return true;
	}

	/**
	 * Merge a single character TextPosition into the current object. This is to be used only for
	 * cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
	 * display, we could overlay them, but for text extraction we need to merge them. Use the
	 * contains() method to test if two objects overlap.
	 *
	 * @param diacritic TextPosition to merge into the current TextPosition.
	 */
	public void mergeDiacritic(TextPosition diacritic)
	{
		if (diacritic.getUnicode().length() > 1)
		{
			return;
		}

		float diacXStart = diacritic.getXDirAdj();
		float diacXEnd = diacXStart + diacritic.widths[0];

		float currCharXStart = getXDirAdj();

		int strLen = unicode.length();
		boolean wasAdded = false;

		for (int i = 0; i < strLen && !wasAdded; i++)
		{
            if (i >= widths.length)
            {
                Log.i("PdfBox-Android", "diacritic " + diacritic.getUnicode() + " on ligature " +
                    unicode + " is not supported yet and is ignored (PDFBOX-2831)");
                break;
            }
            float currCharXEnd = currCharXStart + widths[i];

			// this is the case where there is an overlap of the diacritic character with the
			// current character and the previous character. If no previous character, just append
			// the diacritic after the current one
			if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
			{
				if (i == 0)
				{
					insertDiacritic(i, diacritic);
				}
				else
				{
					float distanceOverlapping1 = diacXEnd - currCharXStart;
					float percentage1 = distanceOverlapping1/widths[i];

					float distanceOverlapping2 = currCharXStart - diacXStart;
					float percentage2 = distanceOverlapping2/widths[i - 1];

					if (percentage1 >= percentage2)
					{
						insertDiacritic(i, diacritic);
					}
					else
					{
						insertDiacritic(i - 1, diacritic);
					}
				}
				wasAdded = true;
			}
			// diacritic completely covers this character and therefore we assume that this is the
			// character the diacritic belongs to
			else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
			{
				insertDiacritic(i, diacritic);
				wasAdded = true;
			}
			// otherwise, The diacritic modifies this character because its completely
			// contained by the character width
			else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
			{
				insertDiacritic(i, diacritic);
				wasAdded = true;
			}
			// last character in the TextPosition so we add diacritic to the end
			else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == strLen - 1)
			{
				insertDiacritic(i, diacritic);
				wasAdded = true;
			}

			// couldn't find anything useful so we go to the next character in the TextPosition
			currCharXStart += widths[i];
		}
	}

	/**
	 * Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
	 * array to include the extra character width.
	 *
	 * @param i current character
	 * @param diacritic The diacritic TextPosition
	 */
	private void insertDiacritic(int i, TextPosition diacritic)
	{
		StringBuilder sb = new StringBuilder();
		sb.append(unicode.substring(0, i));

		float[] widths2 = new float[widths.length + 1];
		System.arraycopy(widths, 0, widths2, 0, i);

		// Unicode combining diacritics always go after the base character, regardless of whether
		// the string is in presentation order or logical order
		sb.append(unicode.charAt(i));
		widths2[i] = widths[i];
		sb.append(combineDiacritic(diacritic.getUnicode()));
		widths2[i + 1] = 0;

		// get the rest of the string
		sb.append(unicode.substring(i + 1, unicode.length()));
		System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

		unicode = sb.toString();
		widths = widths2;
	}

	/**
	 * Combine the diacritic, for example, convert non-combining diacritic characters to their
	 * combining counterparts.
	 *
	 * @param str String to normalize
	 * @return Normalized string
	 */
	private String combineDiacritic(String str)
	{
		// Unicode contains special combining forms of the diacritic characters which we want to use
		int codePoint = str.codePointAt(0);
		// convert the characters not defined in the Unicode spec
		if (DIACRITICS.containsKey(codePoint))
		{
			return DIACRITICS.get(codePoint);
		}
		else
		{
			return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
		}
	}

	/**
	 * @return True if the current character is a diacritic char.
	 */
	public boolean isDiacritic()
	{
		String text = this.getUnicode();
		if (text.length() != 1)
		{
			return false;
		}
		int type = Character.getType(text.charAt(0));
		return type == Character.NON_SPACING_MARK ||
				type == Character.MODIFIER_SYMBOL ||
				type == Character.MODIFIER_LETTER;
	}

	/**
	 * Show the string data for this text position.
	 *
	 * @return A human readable form of this object.
	 */
	@Override
	public String toString()
	{
		return getUnicode();
	}
}