/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * This code is based on the PDFTextStripper written by Ben Litchfield from * the PDFbox 0.7.x project and licensed under the BSD license. In accordance * with the terms of this license, the following copyright statement is retained: * * Copyright (c) 2003-2007, www.pdfbox.org * All rights reserved. * * Furthermore the modified code is re-licensed under the Apache License, * Version 2.0 as stated above. */ package de.tudarmstadt.ukp.dkpro.core.io.pdf; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Vector; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; import org.apache.pdfbox.util.PDFStreamEngine; import org.apache.pdfbox.util.ResourceLoader; import org.apache.pdfbox.util.TextPosition; /** * This class will take a PDF document and strip out all of the text and ignore the formatting and * such. Please note; it is up to clients of this class to verify that a specific user has the * correct permissions to extract text from the PDF document. * <p> * This class is based on the pdfbox 1.7.0 PDFTextStripper class and was substantially modified and * enhanced for basic paragraph and heading detection. Unfortunately it was not possible to add * these enhancements through sub-classing, thus the code was copied and adapted. */ public abstract class PdfLayoutEventStripper extends PDFStreamEngine { public static enum Values { LEFT, RIGHT, TOP, BOTTOM, LINESPACING, LINEHEIGHT } public static enum Style { PAGE, PARAGRAPH, HEADING } private PDDocument document; private int currentPageNo = 0; private int startPage = 1; private int maxPage = 0; private int endPage = Integer.MAX_VALUE; private boolean suppressDuplicateOverlappingText = true; private boolean shouldSeparateByBeads = true; private List<PDThreadBead> pageArticles = null; /** * The charactersByArticle is used to extract text by article divisions. For example a PDF that * has two columns like a newspaper, we want to extract the first column and then the second * column. In this example the PDF would have 2 beads(or articles), one for each column. The * size of the charactersByArticle would be 5, because not all text on the screen will fall into * one of the articles. The five divisions are shown below * * Text before first article first article text text between first article and second article * second article text text after second article * * Most PDFs won't have any beads, so charactersByArticle will contain a single entry. */ protected Vector<List<TextPosition>> charactersByArticle = new Vector<List<TextPosition>>(); private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>(); /** * Instantiate a new PDFTextStripper object. This object will load properties from * Resources/PDFTextStripper.properties. * * @throws IOException * If there is an error loading the properties. */ public PdfLayoutEventStripper() throws IOException { super(ResourceLoader.loadProperties( "org/apache/pdfbox/resources/PDFTextStripper.properties", true)); } /** * Instantiate a new PDFTextStripper object. Loading all of the operator mappings from the * properties object that is passed in. * * @param props * The properties containing the mapping of operators to PDFOperator classes. * * @throws IOException * If there is an error reading the properties. */ public PdfLayoutEventStripper(final Properties props) throws IOException { super(props); } /** * This will take a PDDocument and write the text of that document to the print writer. * * @param doc * The document to get the data from. * * @throws IOException * If the doc is in an invalid state. */ public void writeText(final PDDocument doc) throws IOException { resetEngine(); currentPageNo = 0; document = doc; startDocument(document); if (document.isEncrypted()) { // We are expecting non-encrypted documents here, but it is common // for users to pass in a document that is encrypted with an empty // password (such a document appears to not be encrypted by // someone viewing the document, thus the confusion). We will // attempt to decrypt with the empty password to handle this case. // try { document.decrypt(""); } catch (CryptographyException e) { throw new IOException("Error decrypting document, details: ", e); } catch (InvalidPasswordException e) { throw new IOException("Error: document is encrypted", e); } } processPages(document.getDocumentCatalog().getAllPages()); endDocument(document); } /** * This will process all of the pages and the text that is in them. * * @param pages * The pages object in the document. * * @throws IOException * If there is an error parsing the text. */ protected void processPages(List<PDPage> pages) throws IOException { maxPage = pages.size(); for (final PDPage page : pages) { currentPageNo++; final PDStream contentStream = page.getContents(); if (contentStream != null) { final COSStream contents = contentStream.getStream(); processPage(page, contents); } } } /** * This will process the contents of a page. * * @param page * The page to process. * @param content * The contents of the page. * * @throws IOException * If there is an error processing the page. */ protected void processPage(final PDPage page, final COSStream content) throws IOException { if ((currentPageNo >= startPage) && (currentPageNo <= endPage)) { startPage(startPage, Math.min(maxPage, endPage), currentPageNo, page); pageArticles = page.getThreadBeads(); int numberOfArticleSections = 1 + pageArticles.size() * 2; if (!shouldSeparateByBeads) { numberOfArticleSections = 1; } final int originalSize = charactersByArticle.size(); charactersByArticle.setSize(numberOfArticleSections); for (int i = 0; i < numberOfArticleSections; i++) { if (numberOfArticleSections < originalSize) { charactersByArticle.get(i).clear(); } else { charactersByArticle.set(i, new ArrayList<TextPosition>()); } } characterListMapping.clear(); // processStream will call showCharacter were we will simply // collect all the TextPositions for the page processStream(page, page.findResources(), content); // Now we do the real processing for (int i = 0; i < charactersByArticle.size(); i++) { processArticle(charactersByArticle.get(i)); } endPage(startPage, endPage, currentPageNo, page); } } /** * This method tries do detect headings and paragraphs and line boundaries. * * @param textList * the text. * @throws IOException * if there is an error writing to the stream. */ protected void processArticle(final List<TextPosition> textList) throws IOException { // Nothing to do in this article? if (textList.size() == 0) { return; } // System.out.println("XScale: "+textList.get(0).getXScale()); // System.out.println("YScale: "+textList.get(0).getYScale()); final int prediction_depth = 10; Prediction pred = null; final Block block = new Block(textList, 0); Line currentLine = null; boolean newRegion = false; Style currentStyle = null; Style prevStyle = null; int cur = 0; while (cur < textList.size()) { // Initialize the line (if not already done) if (currentLine == null) { currentLine = new Line(textList, cur); // Get the style for the line (base on style for current // element) prevStyle = currentStyle; currentStyle = getStyle(textList.get(cur)); // Test for a style change if ((newRegion) || (prevStyle != currentStyle)) { if (newRegion) { newRegion = false; } // On a style change issue the proper events if (prevStyle != null) { endRegion(prevStyle); } startRegion(currentStyle); pred = predictGeneralStructure(textList, cur, prediction_depth); } } // Check if we left the line if (!currentLine.withinLine(textList.get(cur)) && !currentLine.isSuperscript(textList.get(cur)) && !currentLine.isSubscript(textList.get(cur))) { // We left the line currentLine = null; // Check if we left the region final boolean columnSwitch = isColumnSwitch(textList.get(cur), block); final boolean leftIndented = isLeftIndented(textList.get(cur), pred); final boolean leftOutdented = isLeftOutdented(textList.get(cur), pred); // boolean fontSwitch = (fontSize[cur] != fontSize[cur-1]); final boolean vAdjacent = isVerticallyAdjacent(textList.get(cur).getY(), textList .get(cur - 1).getY(), block.linespacing); if (!columnSwitch && !leftIndented && !leftOutdented && /* !fontSwitch && */vAdjacent) { // Same region. Issue a line separator and restart processLineSeparator(); } else { // New region newRegion = true; block.reset(cur); if ((pred == null) || !vAdjacent) { pred = predictGeneralStructure(textList, cur, prediction_depth); } else if (vAdjacent) { // If the block is directly adjacent, we may be better // of // with the old prediction... let's see if we can get a // comparatively good new one. final Prediction new_pred = predictGeneralStructure(textList, cur, prediction_depth); final boolean badPred = isSignifiantlyWorse(new_pred.quality, pred.quality, 0.4); if (!badPred) { pred = new_pred; } } } continue; // Start again to create a new currentLine } // Ok, we are in the same line still. // Let's check if the block is adjacent or needs a space // if (!isRightAdjacent(textList, cur, cur-1, cur-2)) { if ((cur > 0) && !isNextChar(textList.get(cur), textList.get(cur - 1))) { processWordSeparator(); } // Grow the current block to calculate better spacings. block.grow(cur); // Write of the characters and advance. writeCharacters(textList.get(cur)); cur++; } // Close region if (currentStyle != null) { endRegion(currentStyle); } } /** * This will show add a character to the list of characters to be printed to the text file. * * @param text * The description of the character to display. */ @Override protected void processTextPosition(final TextPosition text) { boolean showCharacter = true; if (suppressDuplicateOverlappingText) { showCharacter = false; final String textCharacter = text.getCharacter(); final float textX = text.getX(); final float textY = text.getY(); List<TextPosition> sameTextCharacters = characterListMapping.get(textCharacter); if (sameTextCharacters == null) { sameTextCharacters = new ArrayList<TextPosition>(); characterListMapping.put(textCharacter, sameTextCharacters); } // RDD - Here we compute the value that represents the end of the // rendered // text. This value is used to determine whether subsequent text // rendered // on the same line overwrites the current text. // // We subtract any positive padding to handle cases where extreme // amounts // of padding are applied, then backed off (not sure why this is // done, but there // are cases where the padding is on the order of 10x the character // width, and // the TJ just backs up to compensate after each character). Also, // we subtract // an amount to allow for kerning (a percentage of the width of the // last // character). // boolean suppressCharacter = false; final float tolerance = (text.getWidth() / textCharacter.length()) / 3.0f; for (int i = 0; i < sameTextCharacters.size() && textCharacter != null; i++) { final TextPosition character = sameTextCharacters.get(i); final String charCharacter = character.getCharacter(); final float charX = character.getX(); final float charY = character.getY(); // only want to suppress if (charCharacter != null && // charCharacter.equals( textCharacter ) && within(charX, textX, tolerance) && within(charY, textY, tolerance)) { suppressCharacter = true; } } if (!suppressCharacter && (text.getCharacter() != null) && (text.getCharacter().length() > 0)) { sameTextCharacters.add(text); showCharacter = true; } } if (showCharacter) { // if we are showing the character then we need to determine which // article it belongs to. int foundArticleDivisionIndex = -1; int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; int notFoundButFirstLeftArticleDivisionIndex = -1; int notFoundButFirstAboveArticleDivisionIndex = -1; final float x = text.getX(); final float y = text.getY(); if (shouldSeparateByBeads) { for (int i = 0; i < pageArticles.size() && foundArticleDivisionIndex == -1; i++) { final PDThreadBead bead = pageArticles.get(i); if (bead != null) { final PDRectangle rect = bead.getRectangle(); if (rect.contains(x, y)) { foundArticleDivisionIndex = i * 2 + 1; } else if ((x < rect.getLowerLeftX() || y < rect.getUpperRightY()) && notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) { notFoundButFirstLeftAndAboveArticleDivisionIndex = i * 2; } else if (x < rect.getLowerLeftX() && notFoundButFirstLeftArticleDivisionIndex == -1) { notFoundButFirstLeftArticleDivisionIndex = i * 2; } else if (y < rect.getUpperRightY() && notFoundButFirstAboveArticleDivisionIndex == -1) { notFoundButFirstAboveArticleDivisionIndex = i * 2; } } else { foundArticleDivisionIndex = 0; } } } else { foundArticleDivisionIndex = 0; } int articleDivisionIndex = -1; if (foundArticleDivisionIndex != -1) { articleDivisionIndex = foundArticleDivisionIndex; } else if (notFoundButFirstLeftAndAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; } else if (notFoundButFirstLeftArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; } else if (notFoundButFirstAboveArticleDivisionIndex != -1) { articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; } else { articleDivisionIndex = charactersByArticle.size() - 1; } final List<TextPosition> textList = charactersByArticle.get(articleDivisionIndex); textList.add(text); } } /** * This will determine of two floating point numbers are within a specified variance. * * @param first * The first number to compare to. * @param second * The second number to compare to. * @param variance * The allowed variance. * @return if the number is within the specified variance. */ private static boolean within(final float first, final float second, final float variance) { return second > first - variance && second < first + variance; } private static float getWordSpacing(final TextPosition position) { if (position == null) { return 0; } float wordSpacing = 0; if (wordSpacing == 0) { // try to get width of a space character wordSpacing = position.getWidthOfSpace(); // if still zero fall back to getting the width of the current // character if (wordSpacing == 0) { wordSpacing = position.getWidth(); } } return wordSpacing; } private static boolean validPosition(final List<TextPosition> textList, final int pos) { return (pos >= 0) && (pos < textList.size()); } /** * Detects whether text in two positions is on the same line. This method is a bit fuzzy so we * also get potential superscripts and subscripts. * * @param cur current position. * @param prev previous position. * @return if both are in the same line. */ private static boolean isSameLine(final TextPosition cur, final TextPosition prev) { if (cur.getY() == prev.getY()) { return true; } else { final float prevCenter = prev.getY() + prev.getHeight() / 2.0f; final float prevHeight = prev.getHeight(); final float curCenter = cur.getY() + cur.getHeight() / 2.0f; final boolean result = Math.abs(curCenter - prevCenter) < (prevHeight * 0.25f); // if (!result) { // _log.debug("sameLine ["+result+"]"+ // "[px:"+f_y1[prev]+"-"+f_y2[prev]+":"+contents[prev]+"]"+ // "[cx:"+f_y1[cur]+"-"+f_y2[cur]+":"+contents[cur]+"]"); // } return result; } } /** * Tests if two objects are vertically adjacent or if they are so far away from each other that * they have to be considered different blocks. * * @param cur_top * current top. * @param prev_top * previous top. * @param spacing * spacing. * @return if the two objects are verticalla adjacent. */ private static boolean isVerticallyAdjacent(final float cur_top, final float prev_top, final float spacing) { /* set vertical error margin */ final float verterr = (float) (spacing * 1.27); final boolean aboveThreshold = (cur_top < (prev_top + verterr)); final boolean belowprev = (cur_top > prev_top); return aboveThreshold && belowprev; } private static boolean isLeftIndented(final TextPosition cur, final Prediction pred) { return cur.getX() > (pred.left + (pred.linespacing * 0.2)); } private static boolean isLeftOutdented(final TextPosition cur, final Prediction pred) { return cur.getX() < (pred.left - (pred.linespacing * 0.2)); } /** * Check if the current fragment is in a new column. * * @param cur * current text position. * @param block * current block. * @return if the fragment is in a new column. */ private static boolean isColumnSwitch(final TextPosition cur, final Block block) { return (cur.getY() < block.top); // && (f_x1[cur] > block.right); } private static boolean isSignifiantlyWorse(final double qnew, final double qold, final double limit) { final double deviation = Math.abs(((qnew - qold) / (qnew + qold))); final boolean result = (deviation > limit) && (qnew < qold); // if (_log.isTraceEnabled()) { // _log.trace("Deviation: "+deviation+ " - "+(result?"BAD":"OK")); // } return result; } /** * Determine whether we need to insert a word separator between the two positions or not. * * Adapted from PDFBox PDFTextStripper.flushText() * * @param cur * current position. * @param prev * previous position. * @return if the two positions are immediately adjacent. */ private static boolean isNextChar(final TextPosition cur, final TextPosition prev) { float lastWordSpacing = getWordSpacing(prev); final float wordSpacing = getWordSpacing(cur); float startOfNextWordX; final float endOfLastTextX = prev.getX() + prev.getWidth(); // RDD - We add a conservative approximation for space determination. // basically if there is a blank area between two characters that is // equal to some percentage of the word spacing then that will be the // start of the next word if (lastWordSpacing <= 0) { startOfNextWordX = endOfLastTextX + (wordSpacing * 0.50f); } else { startOfNextWordX = endOfLastTextX + (((wordSpacing + lastWordSpacing) / 2f) * 0.50f); } lastWordSpacing = wordSpacing; // if (startOfNextWordX > cur.getX()) { // System.out.print("{O:"+(startOfNextWordX - cur.getX())+"}"); // } if (startOfNextWordX != -1 && startOfNextWordX < cur.getX() && prev != null && // only bother adding a space if the last character was not a // space prev.getCharacter() != null && !prev.getCharacter().endsWith(" ")) { return false; } else { return true; } } private List<Line> collectLines(final List<TextPosition> textList, final int blk_start, final int depth) { final ArrayList<Line> lines = new ArrayList<Line>(depth); Line l = new Line(textList, blk_start); lines.add(l); for (int i = 1; i < depth && l.hasNextLine(); i++) { l = l.getNextLine(); // Bail out if we have a potential column switch if (l.top < lines.get(lines.size() - 1).bottom) { break; } lines.add(l); } return lines; } /** * Return a block with the probable linespacing, lineheight and left and right borders. * * @param textList * text. * @param blk_start * block start. * @param depth * depth. * @return structure prediction. */ private Prediction predictGeneralStructure(final List<TextPosition> textList, final int blk_start, final int depth) { // Try to fetch the next lines up to depth final List<Line> lines = collectLines(textList, blk_start, depth); // Calculate the line block parameters LineBlock lb = new LineBlock(lines); // Iterate once more over the lines because we may have a big spacing // indicating a new block. final List<Line> lines2 = new ArrayList<Line>(depth); final Line l = lines.get(0); lines2.add(l); for (int i = 1; i < lines.size(); i++) { // Bail out if we have too much distance if (!isVerticallyAdjacent(lines.get(i).top, lines.get(i - 1).top, lb.linespacing)) { break; } lines2.add(lines.get(i)); } // Get the bounds in buckets final Buckets left_buckets = new Buckets(lb.linespacing * 0.1); final Buckets right_buckets = new Buckets(lb.linespacing * 0.1); for (final Line ln : lines2) { left_buckets.put(ln.left); right_buckets.put(ln.right); } // if (_log.isTraceEnabled()) { // _log.trace("Left: size:"+left_buckets.getBest().size()+" - lines:"+lines2.size()+" - depth:"+depth); // } lb = new LineBlock(lines2); // Return values final Prediction result = new Prediction(); result.linespacing = lb.linespacing; result.lineheight = lb.avglineheight; result.left = (float) left_buckets.getBest().getValue(); result.right = (float) right_buckets.getBest().getValue(); result.quality = (float) left_buckets.getBest().size() / (float) depth; return result; } protected Style getStyle(final TextPosition pos) { if ((pos.getFontSize() * pos.getYScale()) > 14) { return Style.HEADING; } else { return Style.PARAGRAPH; } } /** * This method is available for subclasses of this class. It will be called before processing of * the document start. * * @param pdf * The PDF document that is being processed. * @throws IOException * If an IO error occurs. */ protected abstract void startDocument(PDDocument pdf) throws IOException; /** * This method is available for subclasses of this class. It will be called after processing of * the document finishes. * * @param pdf * The PDF document that is being processed. * @throws IOException * If an IO error occurs. */ protected abstract void endDocument(PDDocument pdf) throws IOException; /** * Start a new region. * * @param style * the style. * @throws IOException * If there is any error writing to the stream. */ protected abstract void startRegion(Style style) throws IOException; /** * End a region. * * @param style * the style. * @throws IOException * If there is any error writing to the stream. */ protected abstract void endRegion(Style style) throws IOException; /** * Start a new page. * * @param firstPage * first page. * @param lastPage * last page. * @param currentPage * current page. * @param page * The page we are about to process. * * @throws IOException * If there is any error writing to the stream. */ protected abstract void startPage(int firstPage, int lastPage, int currentPage, PDPage page) throws IOException; /** * End a page. * * @param firstPage * first page. * @param lastPage * last page. * @param currentPage * current page. * @param page * The page we are about to process. * * @throws IOException * If there is any error writing to the stream. */ protected abstract void endPage(int firstPage, int lastPage, int currentPage, PDPage page) throws IOException; protected abstract void processLineSeparator() throws IOException; protected abstract void processWordSeparator() throws IOException; /** * Write the string to the output stream. * * @param text * The text to write to the stream. * @throws IOException * If there is an error when writing the text. */ protected abstract void writeCharacters(TextPosition text) throws IOException; /** * This is the page that the text extraction will start on. The pages start at page 1. For * example in a 5 page PDF document, if the start page is 1 then all pages will be extracted. If * the start page is 4 then pages 4 and 5 will be extracted. The default value is 1. * * @return Value of property startPage. */ public int getStartPage() { return startPage; } /** * This will set the first page to be extracted by this class. * * @param startPageValue * New value of property startPage. */ public void setStartPage(final int startPageValue) { startPage = startPageValue; } /** * This will get the last page that will be extracted. This is inclusive, for example if a 5 * page PDF an endPage value of 5 would extract the entire document, an end page of 2 would * extract pages 1 and 2. This defaults to Integer.MAX_VALUE such that all pages of the pdf will * be extracted. * * @return Value of property endPage. */ public int getEndPage() { return endPage; } /** * This will set the last page to be extracted by this class. * * @param endPageValue * New value of property endPage. */ public void setEndPage(final int endPageValue) { endPage = endPageValue; } /** * @return Returns the suppressDuplicateOverlappingText. */ public boolean shouldSuppressDuplicateOverlappingText() { return suppressDuplicateOverlappingText; } /** * Get the current page number that is being processed. * * @return A 1 based number representing the current page. */ protected int getCurrentPageNo() { return currentPageNo; } /** * Character strings are grouped by articles. It is quite common that there will only be a * single article. This returns a List that contains List objects, the inner lists will contain * TextPosition objects. * * @return A double List of TextPositions for all text strings on the page. */ protected List<List<TextPosition>> getCharactersByArticle() { return charactersByArticle; } /** * By default the text stripper will attempt to remove text that overlapps each other. Word * paints the same character several times in order to make it look bold. By setting this to * false all text will be extracted, which means that certain sections will be duplicated, but * better performance will be noticed. * * @param suppressDuplicateOverlappingTextValue * The suppressDuplicateOverlappingText to set. */ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue) { this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; } /** * This will tell if the text stripper should separate by beads. * * @return If the text will be grouped by beads. */ public boolean shouldSeparateByBeads() { return shouldSeparateByBeads; } /** * Set if the text stripper should group the text output by a list of beads. The default value * is true! * * @param aShouldSeparateByBeads * The new grouping of beads. */ public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) { this.shouldSeparateByBeads = aShouldSeparateByBeads; } static class LineBlock { final List<Line> lines; final float linespacing; final float avglineheight; LineBlock(final List<Line> ls) { lines = ls; linespacing = calcLinespacing(); avglineheight = calcAvgLineheight(); } float calcLinespacing() { if (lines.size() == 1) { return Math.abs(lines.get(0).top - lines.get(0).bottom); } float avgls = 0.0f; for (int i = 0; i < (lines.size() - 1); i++) { avgls += Math.abs(lines.get(i).top - lines.get(i + 1).top); } return avgls / (lines.size() - 1); } private float calcAvgLineheight() { float avglh = 0.0f; for (final Line l : lines) { avglh += l.lineheight; } return avglh / lines.size(); } } static class Prediction { float lineheight; float linespacing; float left; float right; float quality; } static class Line extends BasicBlock { final int start; final int end; final float lineheight; Line(final List<TextPosition> tl, final int pos) { super(tl); start = pos; end = findEnd(); lineheight = growAndCalcLineheight(); } private float growAndCalcLineheight() { float h = textList.get(start).getHeight(); reset(start); for (int i = start + 1; i < end; i++) { h = Math.max(h, textList.get(i).getHeight()); grow(i); } return h; } private int findEnd() { int cur = start; while (validPosition(textList, cur) && isSameLine(textList.get(cur), textList.get(start))) { cur++; } return cur; } boolean hasNextLine() { return validPosition(textList, end); } Line getNextLine() { if (hasNextLine()) { return new Line(textList, end); } else { return null; } } /** * Return true if the text position is within the line height boundaries. Left and right * boundaries are not checked. * * @param pos * text position. * @return if the position is within the line. */ boolean withinLine(final TextPosition pos) { final boolean underTop = top <= pos.getY(); final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; return underTop && overBottom; } boolean isSuperscript(final TextPosition pos) { final boolean underTop = (top - lineheight * 0.6f) <= pos.getY(); final boolean overBottom = (pos.getY() + pos.getHeight()) <= bottom; return underTop && overBottom; } boolean isSubscript(final TextPosition pos) { final boolean underTop = (top <= pos.getY()); final boolean overBottom = (pos.getY() + pos.getHeight() + lineheight * 0.6f) <= bottom; return underTop && overBottom; } @Override public String toString() { return "[t:" + top + " b:" + bottom + "|" + content + "]"; } } static class BasicBlock { float left; float top; float right; float bottom; int lines; int last_pos; final List<TextPosition> textList; // This is for debugging purposes only. final StringBuilder content = new StringBuilder(); public BasicBlock(final List<TextPosition> tl) { textList = tl; } float getValue(final Values v) { switch (v) { case BOTTOM: return bottom; case TOP: return top; case RIGHT: return right; case LEFT: return left; default: throw new IllegalArgumentException("Unsupported value"); } } void normalize() { if (top < bottom) { final float b = top; top = bottom; bottom = b; } if (left > right) { final float l = left; left = right; right = l; } } void reset(final int pos) { final TextPosition p = textList.get(pos); last_pos = pos; lines = 0; left = p.getX(); right = p.getX() + p.getWidth(); top = p.getY(); bottom = p.getY() + p.getHeight(); content.setLength(0); content.append(p.getCharacter()); } void grow(final int pos) { final TextPosition p = textList.get(pos); if (!isSameLine(p, textList.get(last_pos))) { lines++; } last_pos = pos; left = Math.min(p.getX(), left); right = Math.max(p.getX() + p.getWidth(), right); top = Math.min(p.getY(), top); bottom = Math.max(p.getY() + p.getHeight(), bottom); content.append(" "); content.append(p.getCharacter()); } } class Block extends BasicBlock { float linespacing; float lineheight; Block(final List<TextPosition> textList, final int pos) { super(textList); reset(pos); } @Override void reset(final int pos) { super.reset(pos); linespacing = new LineBlock(collectLines(textList, pos, 3)).linespacing; lineheight = Math.abs(bottom - top); } @Override void grow(final int pos) { super.grow(pos); lineheight = Math.max(lineheight, textList.get(pos).getHeight()); } } }