/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical.column;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.Constants;
import org.elacin.pdfextract.content.PhysicalContent;
import org.elacin.pdfextract.content.PhysicalPageRegion;
import org.elacin.pdfextract.content.WhitespaceRectangle;
import org.elacin.pdfextract.geom.FloatPoint;
import org.elacin.pdfextract.geom.HasPosition;
import org.elacin.pdfextract.geom.Rectangle;
import org.elacin.pdfextract.geom.RectangleCollection;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
import static org.elacin.pdfextract.Constants.*;
import static org.elacin.pdfextract.geom.RectangleCollection.Direction.E;
import static org.elacin.pdfextract.geom.RectangleCollection.Direction.W;
/**
* Created by IntelliJ IDEA. User: elacin Date: Jun 23, 2010 Time: 13:05:06
*/
public final class WhitespaceFinder {
// ------------------------------ FIELDS ------------------------------
private static final Logger log = Logger.getLogger(WhitespaceFinder.class);
/* min[Height|Width] are the thinnest rectangles we will accept */
private final float minHeight, minWidth;
/* all the obstacles in the algorithm are found here, and are initially all
the words on the page */
protected final RectangleCollection region;
/**
* State while working follows below
*/
/* a queue which will give us the biggest/best rectangles first */
private final PriorityQueue<QueueEntry> queue;
/* this holds a list of all queue entries which are not yet accepted. Upon finding a new
* whitespace rectangle, these are added back to the queue. */
private final List<QueueEntry> holdList = new ArrayList<QueueEntry>();
/* this holds all the whitespace rectangles we have found */
private final WhitespaceRectangle[] foundWhitespace;
private int foundWhitespaceCount = 0;
/* the number of whitespace we want to find */
private final int wantedWhitespaces;
// --------------------------- CONSTRUCTORS ---------------------------
WhitespaceFinder(RectangleCollection region, final int numWantedWhitespaces, final float minWidth,
final float minHeight) {
this.region = region;
wantedWhitespaces = numWantedWhitespaces;
foundWhitespace = new WhitespaceRectangle[numWantedWhitespaces];
queue = new PriorityQueue<QueueEntry>(WHITESPACE_MAX_QUEUE_SIZE);
this.minWidth = minWidth;
this.minHeight = minHeight;
}
// -------------------------- PUBLIC STATIC METHODS --------------------------
public static List<WhitespaceRectangle> findWhitespace(final PhysicalPageRegion region) {
final long t0 = System.currentTimeMillis();
final int numWhitespaces = WHITESPACE_NUMBER_WANTED;
WhitespaceFinder finder = new WhitespaceFinder(region, numWhitespaces,
region.getMinimumColumnSpacing(), region.getMinimumRowSpacing());
final List<WhitespaceRectangle> ret = finder.findWhitespace();
final long time = System.currentTimeMillis() - t0;
log.info(String.format("LOG00380:%d of %d whitespaces for %s in %d ms", ret.size(),
numWhitespaces, region, time));
return ret;
}
// -------------------------- STATIC METHODS --------------------------
/**
* Finds the obstacle which is closest to the centre of the rectangle bound
*/
static HasPosition choosePivot(QueueEntry entry) {
final FloatPoint centrePoint = entry.bound.centre();
float minDistance = Float.MAX_VALUE;
HasPosition closestToCentre = entry.obstacles[0];
for (int i = 0; i < entry.numObstacles; i++) {
HasPosition obstacle = entry.obstacles[i];
final float distance = obstacle.getPos().distance(centrePoint) * 100.0f
/ obstacle.getPos().height;
if (distance < minDistance) {
minDistance = distance;
closestToCentre = obstacle;
}
}
return closestToCentre;
}
/**
* Checks whether the rectangle represented by whitespaceCandidate is empty enough to be
* considered a whitespace rectangle
*/
static boolean isEmptyEnough(QueueEntry whitespaceCandidate) {
if (Constants.WHITESPACE_FUZZY_EMPTY_CHECK && (whitespaceCandidate.numObstacles != 0)) {
/* accept a small intersection */
float intersectSum = 0.0f,
whitespaceArea = whitespaceCandidate.bound.area();
final float intersectLimit = whitespaceArea * WHITESPACE_FUZZINESS;
for (int i = 0; i < whitespaceCandidate.numObstacles; i++) {
final Rectangle obstaclePos = whitespaceCandidate.obstacles[i].getPos();
final float intersectSize = whitespaceCandidate.bound.intersection(
obstaclePos).area();
final float smallestArea = Math.min(obstaclePos.area(), whitespaceArea);
if (intersectSize > smallestArea * WHITESPACE_FUZZINESS) {
return false;
}
intersectSum += intersectSize;
}
return intersectSum < intersectLimit;
}
return whitespaceCandidate.numObstacles == 0;
}
/**
* This is the quality function by which we sort rectangles to choose the 'best' one first. The
* current function bases itself on the area of the rectangle, and then prefers high ones
*/
static float rectangleQuality(Rectangle r) {
return r.area() * (1 + r.height * 0.25f);
}
// -------------------------- OTHER METHODS --------------------------
/**
* The main algorithm. Finds the next whitespace rectangle
* @return A new identified whitespace rectangle
*/
WhitespaceRectangle findNextWhitespace() {
queue.addAll(holdList);
holdList.clear();
while (!queue.isEmpty()) {
/** Place an upper bound. If we reach this queue size we should already have enough data */
if (WHITESPACE_MAX_QUEUE_SIZE - 4 <= queue.size()) {
log.warn("Queue too long");
return null;
}
/** this will always choose the rectangle with the highest priority */
final QueueEntry current = queue.remove();
/**
* If we have accepted a whitespace rectangle since this was added to the queue, we need
* to recalculate the obstacles it references to make sure it doesnt overlap
*/
if (current.numberOfWhitespaceFound != foundWhitespaceCount) {
updateObstacleListForQueueEntry(current);
}
/**
* if this contains no obstacles (or just barely touches on some) we have found a
* new whitespace rectangle
*/
if (isEmptyEnough(current)) {
final WhitespaceRectangle newWhitespace = new WhitespaceRectangle(current.bound);
/** check if we accept the whitespace rectangle or not */
/* check whether the whitespace is connected to either an edge or an existing
* whitespace. if it is not, leave it in the holdList list for now */
if (WHITESPACE_CHECK_CONNECTED_FROM_EDGE &&!isNextToWhitespaceOrEdge(newWhitespace)) {
holdList.add(current);
continue;
}
/* find all the surrounding content. make sure this rectangle is not too small.
* This is an expensive check, which is why it is done here. i think it is still
* correct. */
if (WHITESPACE_CHECK_LOCAL_HEIGHT) {
if (isWhitespaceTooShortForSurroundingText(newWhitespace)) {
continue;
}
}
/* we do not want to accept whitespace rectangles which has only one or two words
* on each side (0 is fine), as these doesn't affect layout and tend to break up
* small paragraphs of text unnecessarily */
if (WHITESPACE_CHECK_TEXT_BOTH_SIDES) {
if (isWhitespaceNeedlesslySeparatingText(newWhitespace)) {
continue;
}
}
return newWhitespace;
}
/** choose an obstacle near the middle of the current rectangle */
final HasPosition pivot = choosePivot(current);
/**
* Create four subrectangles, one on each side of the pivot, and determine the obstacles
* located inside it. Then add each subrectangle to the queue (as long as it is not too
* thin)
*/
final QueueEntry[] subrectangles = splitSearchAreaAround(current, pivot);
for (QueueEntry sub : subrectangles) {
if (sub == null) {
continue;
}
queue.add(sub);
}
}
/* if we ran out of rectangles in the queue, return null to signal that. */
return null;
}
/**
* This method provides a personal touch to the algorithm described in the paper which is
* referenced. Here we will just accept rectangles which are adjacent to either another one
* which we have already identified, or which are adjacent to the edge of the page.
* <p/>
* By assuring that the we thus form continous chains of rectangles, the results seem to be much
* better.
*/
final boolean isNextToWhitespaceOrEdge(final WhitespaceRectangle newWhitespace) {
/* accept this rectangle if it is adjacent to the edge of the page */
final float l = WHITESPACE_OBSTACLE_OVERLAP;
final Rectangle wPos = newWhitespace.getPos(),
rPos = region.getPos();
if ((wPos.x <= rPos.x + l) || (wPos.y <= rPos.y + l) || (wPos.endX >= rPos.endX - l)
|| (wPos.endY >= rPos.endY - l)) {
return true;
}
/* also accept if it borders one of the already identified whitespaces */
for (int i = 0; i < foundWhitespaceCount; i++) {
final WhitespaceRectangle existing = foundWhitespace[i];
if (wPos.distance(existing.getPos()) <= WHITESPACE_OBSTACLE_OVERLAP) {
return true;
}
}
return false;
}
/**
* Finds up to the requested amount of whitespace rectangles based on the contents on the page
* which has been provided.
*
* @return whitespace rectangles
*/
List<WhitespaceRectangle> findWhitespace() {
if (foundWhitespaceCount == 0) {
/* first add the whole page (all its contents as obstacle)s to the priority queue */
int obstacleCount = region.getContents().size();
HasPosition[] obstacles = region.getContents().toArray(new HasPosition[obstacleCount]);
queue.add(new QueueEntry(region.getPos(), obstacles, obstacleCount, 0));
/* continue looking for whitespace until we have the wanted number or we run out */
while (foundWhitespaceCount < wantedWhitespaces) {
final WhitespaceRectangle newRectangle = findNextWhitespace();
/* if no further rectangles exist, stop looking */
if (newRectangle == null) {
break;
}
foundWhitespace[foundWhitespaceCount++] = newRectangle;
}
}
ArrayList<WhitespaceRectangle> ret = new ArrayList<WhitespaceRectangle>(foundWhitespaceCount);
for (int i = 0; i < foundWhitespaceCount; i++) {
ret.add(foundWhitespace[i]);
}
return ret;
}
/**
* Check if the whitespace rectangle is made useless by the way it separates text. see thesis
* text for details.
*/
boolean isWhitespaceNeedlesslySeparatingText(final WhitespaceRectangle newWhitespace) {
if (newWhitespace.getPos().width > 30) {
return false;
}
/* decrease the size a tiny bit, so we don't include what blocked the rectangle, especially
* above and below */
Rectangle search = newWhitespace.getPos().getAdjustedBy(-1.0f);
final float range = 8.0f;
final List<PhysicalContent> right = region.searchInDirectionFromOrigin(E, search, range);
int rightCount = 0;
for (PhysicalContent content : right) {
if (content.isText()) {
rightCount++;
}
}
if ((rightCount == 1) || (rightCount == 2)) {
final List<PhysicalContent> left = region.searchInDirectionFromOrigin(W, search, range);
int leftCount = 0;
for (PhysicalContent content : left) {
if (content.isText()) {
leftCount++;
}
}
if ((leftCount == 1) || (leftCount == 2)) {
return true;
}
}
return false;
}
/**
* Check if newWhitespace is too small considering the surrounding content
*/
boolean isWhitespaceTooShortForSurroundingText(final WhitespaceRectangle newWhitespace) {
final List<PhysicalContent> surroundings = region.findSurrounding(newWhitespace, 8);
if (!surroundings.isEmpty()) {
float averageHeight = 0.0f;
int counted = 0;
for (PhysicalContent surrounding : surroundings) {
if (surrounding.isText()) {
averageHeight += surrounding.getPos().height;
counted++;
}
}
if (counted != 0) {
averageHeight /= (float) counted;
float u = Math.max(((PhysicalPageRegion) region).getMinimumRowSpacing(), averageHeight);
if (u > newWhitespace.getPos().height) {
return true;
}
}
}
return false;
}
/**
* Creates four rectangles with the remaining space left after splitting the current rectangle
* around the pivot. Also divides the obstacles among the newly created rectangles
*/
QueueEntry[] splitSearchAreaAround(final QueueEntry current, final HasPosition pivot) {
/* Everything inside here was the definitely most expensive parts of the implementation,
* so this is quite optimized to avoid too many float point comparisons and needless
* object creations. This cut execution time by some 90ish % :) */
final int missingRectangles = wantedWhitespaces - foundWhitespaceCount;
final float splitX = pivot.getPos().x,
splitEndX = pivot.getPos().endX,
splitY = pivot.getPos().y,
splitEndY = pivot.getPos().endY;
final Rectangle bound = current.bound;
/* check which of the four possible subrectangles we want to create, and their dimensions */
Rectangle left = null;
HasPosition[] leftObs = null;
final float leftWidth = splitX - bound.x;
if ((splitX > bound.x) && (leftWidth > minWidth)) {
left = new Rectangle(bound.x, bound.y, leftWidth, bound.height);
leftObs = new HasPosition[current.numObstacles + missingRectangles];
}
Rectangle above = null;
HasPosition[] aboveObs = null;
final float aboveHeight = splitY - bound.y;
if ((splitY > bound.y) && (aboveHeight > minHeight)) {
above = new Rectangle(bound.x, bound.y, bound.width, aboveHeight);
aboveObs = new HasPosition[current.numObstacles + missingRectangles];
}
Rectangle right = null;
HasPosition[] rightObs = null;
final float rightWidth = bound.endX - splitEndX;
if ((splitEndX < bound.endX) && (rightWidth > minWidth)) {
right = new Rectangle(splitEndX, bound.y, rightWidth, bound.height);
rightObs = new HasPosition[current.numObstacles + missingRectangles];
}
Rectangle below = null;
HasPosition[] belowObs = null;
final float belowHeight = bound.endY - splitEndY;
if ((splitEndY < bound.endY) && (belowHeight > minHeight)) {
below = new Rectangle(bound.x, splitEndY, bound.width, belowHeight);
belowObs = new HasPosition[current.numObstacles + missingRectangles];
}
/**
* All the obstacles in current already fit within current.bound, so we can do just a quick
* check to see where they belong here. this way of doing it is primarily an optimization
*/
int leftIndex = 0,
aboveIndex = 0,
rightIndex = 0,
belowIndex = 0;
final float adjustedSplitX = splitX - WHITESPACE_OBSTACLE_OVERLAP,
adjustedSplitY = splitY - WHITESPACE_OBSTACLE_OVERLAP,
adjustedSplitEndX = splitEndX + WHITESPACE_OBSTACLE_OVERLAP,
adjustedSplitEndY = splitEndY + WHITESPACE_OBSTACLE_OVERLAP;
for (int i = 0; i < current.numObstacles; i++) {
HasPosition obstacle = current.obstacles[i];
final Rectangle obstaclePos = obstacle.getPos();
/* including the pivot will break the algorithm */
if (obstacle == pivot) {
continue;
}
if ((left != null) && (obstaclePos.x < adjustedSplitX)) {
leftObs[leftIndex++] = obstacle;
}
if ((right != null) && (obstaclePos.endX > adjustedSplitEndX)) {
rightObs[rightIndex++] = obstacle;
}
if ((above != null) && (obstaclePos.y < adjustedSplitY)) {
aboveObs[aboveIndex++] = obstacle;
}
if ((below != null) && (obstaclePos.endY > adjustedSplitEndY)) {
belowObs[belowIndex++] = obstacle;
}
}
final int n = foundWhitespaceCount;
return new QueueEntry[] { (left == null) ? null : new QueueEntry(left, leftObs, leftIndex, n),
(right == null)
? null : new QueueEntry(right, rightObs, rightIndex, n),
(above == null)
? null : new QueueEntry(above, aboveObs, aboveIndex, n),
(below == null)
? null : new QueueEntry(below, belowObs, belowIndex, n) };
}
/**
* Checks if some of the newly added whitespace rectangles, that is those discovered after this
* queue entry was added to the queue, overlaps with the area of this queue entry, and if so
* adds them to this list of obstacles .
*/
void updateObstacleListForQueueEntry(final QueueEntry entry) {
int numNewestObstaclesToCheck = foundWhitespaceCount - entry.numberOfWhitespaceFound;
for (int i = 0; i < numNewestObstaclesToCheck; i++) {
final HasPosition obstacle = foundWhitespace[foundWhitespaceCount - 1 - i];
if (entry.bound.intersectsAdmittingOverlap(obstacle.getPos(), WHITESPACE_OBSTACLE_OVERLAP)) {
entry.addObstacle(obstacle);
}
entry.numberOfWhitespaceFound = foundWhitespaceCount;
}
}
// -------------------------- INNER CLASSES --------------------------
static class QueueEntry implements Comparable<QueueEntry> {
final Rectangle bound;
int numberOfWhitespaceFound, numObstacles;
final HasPosition[] obstacles;
final float quality;
private QueueEntry(final Rectangle bound, final HasPosition[] obstacles, int numObstacles,
int numFound) {
this.bound = bound;
this.obstacles = obstacles;
this.numObstacles = numObstacles;
numberOfWhitespaceFound = numFound;
quality = rectangleQuality(bound);
}
public final int compareTo(final QueueEntry other) {
return Float.compare(other.quality, quality);
}
public void addObstacle(HasPosition obstacle) {
obstacles[numObstacles++] = obstacle;
}
}
}