/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.physical.column; import org.apache.log4j.Logger; import org.elacin.pdfextract.content.PhysicalContent; import org.elacin.pdfextract.content.PhysicalPageRegion; import org.elacin.pdfextract.content.WhitespaceRectangle; import org.elacin.pdfextract.geom.MathUtils; import org.elacin.pdfextract.geom.Rectangle; import org.elacin.pdfextract.geom.Sorting; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import static org.elacin.pdfextract.Constants.*; import static org.elacin.pdfextract.geom.RectangleCollection.Direction.E; import static org.elacin.pdfextract.geom.RectangleCollection.Direction.W; /** * Created by IntelliJ IDEA. User: elacin Date: Sep 23, 2010 Time: 12:54:21 PM To change this * template use File | Settings | File Templates. */ public class ColumnFinder { private static final String chars = "()[]abcdef1234567890o.?* "; // ------------------------------ FIELDS ------------------------------ public static final float DEFAULT_COLUMN_WIDTH = 2.0f; private static final Logger log = Logger.getLogger(ColumnFinder.class); // -------------------------- PUBLIC STATIC METHODS -------------------------- @NotNull public static List<WhitespaceRectangle> extractColumnBoundaries(@NotNull PhysicalPageRegion region, @NotNull List<WhitespaceRectangle> whitespaces) { final List<WhitespaceRectangle> columnBoundaries = selectCandidateColumnBoundaries(region, whitespaces); /* adjust columns to real height */ if (COLUMNS_ENABLE_COLUMN_HEIGHT_ADJUSTMENT) { adjustColumnHeights(region, columnBoundaries); } filter(region, columnBoundaries); combineColumnBoundaries(region, columnBoundaries); return columnBoundaries; } // -------------------------- STATIC METHODS -------------------------- @Nullable private static WhitespaceRectangle adjustColumn(final PhysicalPageRegion region, final WhitespaceRectangle boundary, final float boundaryStartX, final float boundaryEndX) { final Rectangle rpos = region.getPos(); /* find surrounding content */ final List<PhysicalContent> everythingRightOf = findAllContentsRightOf(region, boundaryStartX); Collections.sort(everythingRightOf, Sorting.sortByLowerYThenLowerX); final List<PhysicalContent> closeOnLeft = findAllContentsImmediatelyLeftOfX(region, boundaryStartX); Collections.sort(closeOnLeft, Sorting.sortByLowerYThenLowerX); float realBoundaryY = rpos.y; float realBoundaryEndY = rpos.endY; boolean startYFound = false; boolean boundaryStarted = false; for (int y = (int) rpos.y; y <= (int) (rpos.endY + 1.0F); y++) { boolean foundContentRightOfX; PhysicalContent closestOnRight = getClosestToTheRightAtY(everythingRightOf, y, boundary.getPos().endX); if (closestOnRight == null) { continue; } else { foundContentRightOfX = true; } /** if we find something blocking this row, start looking further down */ /* content will be blocking if it intersects, naturally */ boolean blocked = false; if (closestOnRight.getPos().x <= boundaryStartX) { blocked = true; } else if (COLUMNS_ENABLE_TEXT_SPLIT_CHECK) { /* also check if this column boundary would separate two words which otherwise are very close */ for (PhysicalContent left : closeOnLeft) { if (left instanceof WhitespaceRectangle) { continue; } if (y < (int) left.getPos().y) { continue; } if (y > (int) left.getPos().endY) { continue; } if (closestOnRight.getPos().x - left.getPos().endX < 6.0f) { blocked = true; break; } } } if (blocked) { if (boundaryStarted) { break; } startYFound = false; } else { if (!startYFound && foundContentRightOfX) { startYFound = true; realBoundaryY = (float) (y - 1); } if (!boundaryStarted && (y > (int) boundary.getPos().y)) { boundaryStarted = true; } realBoundaryEndY = (float) y; } } if (!startYFound) { return null; } final Rectangle adjusted = new Rectangle(boundaryStartX, realBoundaryY + 0.5f, 1.0f, Math.max(0.1f, realBoundaryEndY - realBoundaryY - 0.5F)); final WhitespaceRectangle newBoundary = new WhitespaceRectangle(adjusted); newBoundary.setScore(1000); return newBoundary; } private static void adjustColumnHeights(@NotNull PhysicalPageRegion region, @NotNull List<WhitespaceRectangle> columnBoundaries) { final Collection<WhitespaceRectangle> newBoundaries = new ArrayList<WhitespaceRectangle>(); for (final WhitespaceRectangle boundary : columnBoundaries) { final Rectangle bpos = boundary.getPos(); /* * calculate three possible columns, on the left and right side of the rectangle, * and along the middle */ final float ADJUST = 1.0f; final float leftX = Math.min(bpos.x + ADJUST, bpos.endX); final float leftEndX = Math.min(leftX + COLUMNS_MIN_COLUMN_WIDTH, bpos.endX - ADJUST); final float midX = bpos.getMiddleX(); final float midEndX = Math.min(midX + COLUMNS_MIN_COLUMN_WIDTH, bpos.endX); final float rightEndX = Math.max(bpos.endX - ADJUST, bpos.x); final float rightX = Math.max(rightEndX - COLUMNS_MIN_COLUMN_WIDTH, bpos.x); // final WhitespaceRectangle middle = adjustColumn(region, boundary, midX, midEndX); final WhitespaceRectangle left = adjustColumn(region, boundary, leftX, leftEndX); final WhitespaceRectangle right = adjustColumn(region, boundary, rightX, rightEndX); /* then choose the tallest */ final float lHeight = ((left == null) ? -1.0f : left.getPos().height); final float mHeight = ((middle == null) ? -1.0f : middle.getPos().height); final float rHeight = ((right == null) ? -1.0f : right.getPos().height); // @Nullable final WhitespaceRectangle adjusted; if ((lHeight > mHeight) && (lHeight > rHeight)) { adjusted = left; } else if ((rHeight > mHeight) && (rHeight > lHeight)) { adjusted = right; } else { if (middle != null) { adjusted = middle; } else if (right != null) { adjusted = right; } else if (left != null) { adjusted = left; } else { adjusted = null; } } if ((adjusted != null) &&!newBoundaries.contains(adjusted)) { newBoundaries.add(adjusted); } } columnBoundaries.clear(); columnBoundaries.addAll(newBoundaries); } private static void combineColumnBoundaries(@NotNull PhysicalPageRegion region, @NotNull List<WhitespaceRectangle> columnBoundaries) { for (int i = 0; i < columnBoundaries.size() - 1; i++) { WhitespaceRectangle left = columnBoundaries.get(i); WhitespaceRectangle right = columnBoundaries.get(i + 1); final Rectangle rpos = right.getPos(); final Rectangle lpos = left.getPos(); if (Math.abs(rpos.x - lpos.x) < 50.0F) { /* combine the two. try first to pick a column index at the right hand side */ final float startY = Math.min(rpos.y, lpos.y); final float endY = Math.max(rpos.endY, lpos.endY); float endX = Math.max(rpos.endX, lpos.endX); float startX = endX - DEFAULT_COLUMN_WIDTH; Rectangle newPos = new Rectangle(startX, startY, DEFAULT_COLUMN_WIDTH, endY - startY); final List<PhysicalContent> intersectingR = region.findContentsIntersectingWith(newPos); /* if the first try intersected with something - try left */ if (!intersectingR.isEmpty()) { startX = Math.max(rpos.x, lpos.x); newPos = new Rectangle(startX, startY, DEFAULT_COLUMN_WIDTH, endY - startY); } final List<PhysicalContent> intersectingL = region.findContentsIntersectingWith(newPos); if (!intersectingL.isEmpty()) { continue; } log.warn("LOG01300:Combining column boundaries " + rpos + " and " + lpos); WhitespaceRectangle newBoundary = new WhitespaceRectangle(newPos); newBoundary.setScore(1000); columnBoundaries.set(i, newBoundary); columnBoundaries.remove(i + 1); i--; Collections.sort(columnBoundaries, Sorting.sortByLowerX); } } } private static void filter(final PhysicalPageRegion r, final List<WhitespaceRectangle> boundaries) { List<WhitespaceRectangle> toRemove = new ArrayList<WhitespaceRectangle>(); StringBuilder sb = new StringBuilder(); Collections.sort(boundaries, Sorting.sortByLowerX); for (int i = boundaries.size() - 1; i >= 0; i--) { final WhitespaceRectangle boundary = boundaries.get(i); if (boundary.getPos().height < r.getPos().height * 0.15f) { toRemove.add(boundary); continue; } final float boundaryToTheLeft = 20; // if (i == 0) { // boundaryToTheLeft = r.getPos().x; // } else { // boundaryToTheLeft = boundaries.get(i - 1).getPos().endX; // } final Rectangle bpos = boundary.getPos(); final float searchWidth = bpos.x - boundaryToTheLeft; if (searchWidth <= 0.0F) { toRemove.add(boundary); continue; } Rectangle search = new Rectangle(boundaryToTheLeft, bpos.y, searchWidth, bpos.height); final List<PhysicalContent> contentToTheLeft = r.findContentsIntersectingWith(search); /* demand a certain amount of words on the left side to split */ if (contentToTheLeft.size() < 4) { toRemove.add(boundary); continue; } sb.setLength(0); for (PhysicalContent content : contentToTheLeft) { if (content.isText()) { sb.append(content.getPhysicalText().getText()); } } int charsFound = 0; for (int j = 0; j < sb.length(); j++) { if (chars.indexOf(sb.charAt(j)) == -1) { charsFound++; } } if (charsFound <= 4) { toRemove.add(boundary); continue; } if ((sb.length() < 20) && (charsFound < 10)) { toRemove.add(boundary); continue; } if (boundary.getPos().x < r.getPos().x + r.getPos().width * 0.05f) { toRemove.add(boundary); continue; } if (boundary.getPos().endX > r.getPos().endX - r.getPos().width * 0.05f) { toRemove.add(boundary); continue; } } if (log.isDebugEnabled()) { log.debug("Removing columns" + toRemove); } boundaries.removeAll(toRemove); } @NotNull private static List<PhysicalContent> findAllContentsImmediatelyLeftOfX( @NotNull PhysicalPageRegion region, float x) { final float lookLeft = 10.0f; final Rectangle rpos = region.getPos(); final Rectangle search = new Rectangle(x - lookLeft, rpos.y, lookLeft, rpos.height); return region.findContentsIntersectingWith(search); } @NotNull private static List<PhysicalContent> findAllContentsRightOf(@NotNull PhysicalPageRegion region, float x) { final Rectangle search = new Rectangle(x, region.getPos().y, region.getPos().width, region.getPos().height); return region.findContentsIntersectingWith(search); } /** * @param everythingRightOf x sorted list * @param y row to look at * @param endX * @return */ @Nullable private static PhysicalContent getClosestToTheRightAtY( final List<PhysicalContent> everythingRightOf, final int y, final float endX) { PhysicalContent closest = null; float minDistance = Float.MAX_VALUE; for (int j = 0; j < everythingRightOf.size(); j++) { PhysicalContent content = everythingRightOf.get(j); if ((content instanceof WhitespaceRectangle) || (content instanceof PhysicalPageRegion)) { continue; } final Rectangle blockerPos = content.getPos(); if (blockerPos.endY < (float) y) { continue; } if (blockerPos.y > (float) y) { break; } float distance = blockerPos.x - endX; if (distance < minDistance) { minDistance = distance; closest = content; } } return closest; } @NotNull private static List<WhitespaceRectangle> selectCandidateColumnBoundaries( @NotNull PhysicalPageRegion region, @NotNull List<WhitespaceRectangle> whitespaces) { final float LOOKAHEAD = 10.0f; final float HALF_LOOKAHEAD = LOOKAHEAD / 2.0F; final List<WhitespaceRectangle> columnBoundaries = new ArrayList<WhitespaceRectangle>(); for (WhitespaceRectangle whitespace : whitespaces) { final Rectangle pos = whitespace.getPos(); final float posX = pos.x; final float posEndX = pos.endX; if (pos.height / pos.width <= 1.5f) { continue; } final Rectangle smallerPos = pos.getAdjustedBy(-1.0f); /* count how much text is to the immediate left of the current whitespace */ final List<PhysicalContent> left = region.searchInDirectionFromOrigin(W, smallerPos, LOOKAHEAD); Collections.sort(left, Sorting.sortByHigherX); int leftCount = 0; for (PhysicalContent content : left) { if (content instanceof WhitespaceRectangle) { continue; } if (MathUtils.isWithinVariance(content.getPos().endX, posX + HALF_LOOKAHEAD, LOOKAHEAD)) { leftCount++; } } /* and how much is to the right */ final List<PhysicalContent> right = region.searchInDirectionFromOrigin(E, smallerPos, LOOKAHEAD); Collections.sort(right, Sorting.sortByLowerX); int rightCount = 0; for (PhysicalContent content : right) { if (content instanceof WhitespaceRectangle) { continue; } if (MathUtils.isWithinVariance(content.getPos().x, posEndX + HALF_LOOKAHEAD, LOOKAHEAD)) { rightCount++; } } if ((leftCount == 0) && (rightCount < 8)) { continue; } if ((rightCount == 0) && (leftCount < 8)) { continue; } if ((leftCount >= 3) || (rightCount >= 3)) { columnBoundaries.add(whitespace); whitespace.setScore(500); } } return columnBoundaries; } }