/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.content;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.Constants;
import org.elacin.pdfextract.geom.HasPosition;
import org.elacin.pdfextract.geom.Rectangle;
import org.elacin.pdfextract.geom.RectangleCollection;
import org.elacin.pdfextract.style.Style;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Created by IntelliJ IDEA. User: elacin Date: Nov 8, 2010 Time: 7:44:41 PM To change this template
* use File | Settings | File Templates.
*/
public class PhysicalPageRegion extends RectangleCollection {
// ------------------------------ FIELDS ------------------------------
private static final Logger log = Logger.getLogger(PhysicalPageRegion.class);
/* content */
@NotNull
private final List<PhysicalPageRegion> subregions = new ArrayList<PhysicalPageRegion>();
@NotNull
private final List<WhitespaceRectangle> whitespace = new ArrayList<WhitespaceRectangle>();
/* the physical page containing this region */
@NotNull
private final PhysicalPage page;
/* the graphics containing this region */
@Nullable
private GraphicContent containingGraphic;
/* average font sizes for this page region */
private transient float _avgFontSizeX;
private transient float _avgFontSizeY;
private transient int _medianOfVerticalDistances;
private transient boolean fontInfoFound;
private transient boolean medianFound;
// --------------------------- CONSTRUCTORS ---------------------------
public PhysicalPageRegion(@NotNull final List<? extends PhysicalContent> contents,
@NotNull PhysicalPage page) {
this(contents, null, page);
}
public PhysicalPageRegion(@NotNull final Collection<? extends PhysicalContent> contents,
@Nullable final PhysicalPageRegion parent, @NotNull PhysicalPage page) {
super(contents, parent);
this.page = page;
}
// ------------------------ INTERFACE METHODS ------------------------
// --------------------- Interface HasPosition ---------------------
@Override
public void calculatePos() {
if (Constants.WHITESPACE_USE_WHOLE_PAGE && (page.getMainRegion() == this)) {
setPos(page.getPageDimensions());
}
super.calculatePos();
if (isGraphicalRegion()) {
setPos(getPos().union(containingGraphic.getPos()));
}
}
// ------------------------ OVERRIDING METHODS ------------------------
@Override
public void clearCache() {
super.clearCache();
fontInfoFound = false;
medianFound = false;
}
// --------------------- GETTER / SETTER METHODS ---------------------
@Nullable
public GraphicContent getContainingGraphic() {
return containingGraphic;
}
@NotNull
public PhysicalPage getPage() {
return page;
}
@NotNull
public List<PhysicalPageRegion> getSubregions() {
return subregions;
}
@NotNull
public List<WhitespaceRectangle> getWhitespace() {
return whitespace;
}
// -------------------------- PUBLIC METHODS --------------------------
public void addSubRegion(final PhysicalPageRegion newSub) {
addContent(newSub);
subregions.add(newSub);
}
public void addWhitespace(final Collection<WhitespaceRectangle> whitespace) {
this.whitespace.addAll(whitespace);
addContents(whitespace);
}
public void ensureAllContentInLeafNodes() {
if (!subregions.isEmpty()) {
Collection<PhysicalContent> contents = new ArrayList<PhysicalContent>();
for (PhysicalContent content : getContents()) {
if (!(content instanceof PhysicalPageRegion)) {
contents.add(content);
}
}
doExtractSubRegion(contents, null, null, true);
}
for (int i = 0, subregionsSize = subregions.size(); i < subregionsSize; i++) {
subregions.get(i).ensureAllContentInLeafNodes();
}
}
public boolean extractSubRegionFromBound(@NotNull Rectangle bound, boolean addToParent) {
final List<PhysicalContent> subContents = findContentsIntersectingWith(bound.getPos());
return doExtractSubRegion(subContents, bound, null, addToParent);
}
/**
* Returns a subregion with all the contents which is contained by bound. If more than two
* pieces of content crosses the boundary of bound, it is deemed inappropriate for dividing the
* page, and an exception is thrown
*
* @return the new region
*/
public void extractSubRegionFromGraphic(@NotNull final GraphicContent graphic, boolean addToParent) {
/* we can allow us to search a bit outside the graphic */
final Rectangle pos = graphic.getPos();
final float extra = 2.0f;
final Rectangle searchPos = new Rectangle(pos.x - extra, pos.y - extra, pos.width + 2 * extra,
pos.height + 2 * extra);
final List<PhysicalContent> subContents = findContentsIntersectingWith(searchPos);
doExtractSubRegion(subContents, graphic, graphic, addToParent);
}
public float getAvgFontSizeX() {
if (!fontInfoFound) {
findAndSetFontInformation();
}
return _avgFontSizeX;
}
public float getAvgFontSizeY() {
if (!fontInfoFound) {
findAndSetFontInformation();
}
return _avgFontSizeY;
}
public int getMedianOfVerticalDistances() {
if (!medianFound) {
findAndSetMedianOfVerticalDistancesForRegion();
}
return _medianOfVerticalDistances;
}
public float getMinimumColumnSpacing() {
return getAvgFontSizeX() * 0.8f;
}
public float getMinimumRowSpacing() {
if (!fontInfoFound) {
findAndSetFontInformation();
}
return (float) getMedianOfVerticalDistances() * 1.1f;
}
public boolean isGraphicalRegion() {
return containingGraphic != null;
}
public void removeSubRegion(final PhysicalPageRegion newSub) {
removeContent(newSub);
subregions.remove(newSub);
}
public void setContainingGraphic(@Nullable GraphicContent containingGraphic) {
if (this.containingGraphic != null) {
removeContent(this.containingGraphic);
}
if (containingGraphic != null) {
this.containingGraphic = containingGraphic;
// addContent(containingGraphic);
}
}
// -------------------------- OTHER METHODS --------------------------
private boolean doExtractSubRegion(@NotNull final Collection<PhysicalContent> subContents,
@Nullable final HasPosition bound,
@Nullable final GraphicContent graphic, boolean addToParent) {
if (subContents.isEmpty()) {
if (log.isInfoEnabled()) {
log.info("LOG00960:bound " + bound + " contains no content in " + this + ". wont "
+ "extract");
}
return false;
}
if (subContents.size() == getContents().size()) {
if (log.isInfoEnabled()) {
log.info("LOG00950:bound " + bound + " contains all content in " + this + ". wont "
+ "extract");
}
return false;
}
/**
* whitespace rectangles might be important for layout both in this region and in the one
* which will be extracted, so leave them in both
*/
boolean onlyWhitespace = true;
List<WhitespaceRectangle> saveWhitespace = new ArrayList<WhitespaceRectangle>();
for (PhysicalContent subContent : subContents) {
if (subContent instanceof WhitespaceRectangle) {
saveWhitespace.add((WhitespaceRectangle) subContent);
} else {
onlyWhitespace = false;
}
}
/* dont bother to create a region for only whitespace */
if (onlyWhitespace) {
if (log.isInfoEnabled()) {
log.info("LOG01330:Tried to extract only whitespace. ignoring");
}
return false;
}
final PhysicalPageRegion newRegion = new PhysicalPageRegion(subContents, this, page);
if (graphic == null) {
newRegion.setContainingGraphic(containingGraphic);
} else {
newRegion.setContainingGraphic(graphic);
}
/* move this regions subregions if they are contained by the new region */
List<PhysicalPageRegion> toMove = new ArrayList<PhysicalPageRegion>();
for (PhysicalContent subContent : subContents) {
if (subContent instanceof PhysicalPageRegion) {
toMove.add((PhysicalPageRegion) subContent);
}
}
for (PhysicalPageRegion subRegionToMove : toMove) {
subregions.remove(subRegionToMove);
newRegion.subregions.add(subRegionToMove);
}
log.info("LOG00890:Extracted PPR:" + newRegion + " from " + this);
removeContents(subContents);
addContents(saveWhitespace);
if (addToParent && (getParent() != null) && (getParent() instanceof PhysicalPageRegion)) {
((PhysicalPageRegion) getParent()).addSubRegion(newRegion);
} else {
addSubRegion(newRegion);
}
return true;
}
/* find average font sizes, and most used style for the region */
protected void findAndSetFontInformation() {
float xFontSizeSum = 0.0f,
yFontSizeSum = 0.0f;
int numCharsFound = 0;
for (PhysicalContent content : getContents()) {
if (content.isText()) {
final Style style = content.getPhysicalText().getStyle();
final int length = content.getPhysicalText().getText().length();
xFontSizeSum += (float) (style.xSize * length);
yFontSizeSum += (float) (style.ySize * length);
numCharsFound += length;
}
}
if (numCharsFound == 0) {
_avgFontSizeX = Float.MIN_VALUE;
_avgFontSizeY = Float.MIN_VALUE;
} else {
_avgFontSizeX = xFontSizeSum / (float) numCharsFound;
_avgFontSizeY = yFontSizeSum / (float) numCharsFound;
}
fontInfoFound = true;
}
/**
* Finds an approximation of the normal vertical line spacing for the region. <p/> This is done
* by looking at three vertical rays, calculating distances between all the lines intersecting
* those lines, and then returning the median of all those distances. <p/> minimum value is 2,
* max is (avgFontSize * 3) -1. -1 if nothing is found;
*/
protected int findAndSetMedianOfVerticalDistancesForRegion() {
final int LIMIT = (int) getAvgFontSizeY() * 3;
int[] distanceCount = new int[LIMIT];
for (float x = getPos().x; x <= getPos().endX; x += getPos().width / 3) {
final List<PhysicalContent> column = findContentAtXIndex(x);
for (int i = 1; i < column.size(); i++) {
final PhysicalContent current = column.get(i - 1);
final PhysicalContent below = column.get(i);
/* increase count for this distance (rounded down to an int) */
final int d = (int) (below.getPos().y - current.getPos().endY);
if ((d > 0) && (d < LIMIT)) {
distanceCount[d]++;
}
}
}
int highestFrequency = -1;
int index = -1;
for (int i = 2; i < distanceCount.length; i++) {
if (distanceCount[i] >= highestFrequency) {
index = i;
highestFrequency = distanceCount[i];
}
}
float temp = (float) Math.max(index, (int) (getAvgFontSizeY() * 0.5f));
_medianOfVerticalDistances = (int) (temp + Math.max(1.0f, temp * 0.1f));
medianFound = true;
return _medianOfVerticalDistances;
}
}