/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical.graphics;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.content.GraphicContent;
import org.elacin.pdfextract.content.PhysicalContent;
import org.elacin.pdfextract.content.PhysicalPageRegion;
import org.elacin.pdfextract.formula.Formulas;
import org.elacin.pdfextract.geom.Rectangle;
import org.elacin.pdfextract.geom.Sorting;
import org.elacin.pdfextract.style.Style;
import org.jetbrains.annotations.NotNull;
import java.util.*;
/**
* Categorizes the graphics of a page region as containers, math bars or plain images, and
* combines horizontal separators which appear to belong together.
* <p>
* Created by elacin, 13.11.10 03.29.
*/
public class GraphicSegmentatorImpl implements GraphicSegmentator {
// ------------------------------ FIELDS ------------------------------
private static final Logger log = Logger.getLogger(GraphicSegmentatorImpl.class);
/* These symbols might be rendered with a normal font, so they are checked in addition to what
   Formulas.textContainsMath would find. */
private static final String POSSIBLE_MATH_SYMBOLS = "()-+";
/* We need the page's dimensions here, because the size of regions is calculated based on content.
 * It should be possible for a graphic to cover all the contents if it doesn't cover all the page. */
private final float h;
private final float w;
// --------------------------- CONSTRUCTORS ---------------------------
/**
 * Creates a segmentator for a page of the given dimensions.
 *
 * @param dims the dimensions of the page; only its width and height are read
 */
public GraphicSegmentatorImpl(final Rectangle dims) {
    this.h = dims.height;
    this.w = dims.width;
}
// ------------------------ INTERFACE METHODS ------------------------
// --------------------- Interface GraphicSegmentator ---------------------
/**
 * Categorizes the given graphics relative to the contents of the region.
 *
 * @param graphics the graphics to categorize
 * @param region   the region whose contents are consulted when categorizing
 * @return the categorized graphics, with horizontal separators sorted by
 *         {@code Sorting.sortByLowerY} and vertical separators by {@code Sorting.sortByLowerX}
 */
@NotNull
public CategorizedGraphics categorizeGraphics(@NotNull List<GraphicContent> graphics,
                                              @NotNull PhysicalPageRegion region) {
    final CategorizedGraphics categorized = new CategorizedGraphics();

    // first pass: every graphic as it was handed to us
    categorizeGraphics(categorized, region, graphics);

    /*
     * This is a hack to deal with situations where one creates a table or similar with
     * horizontal lines only. These would not be separators, so lines which seem to belong
     * together are merged and the merged graphics are categorized in a second pass.
     */
    final List<GraphicContent> mergedSeparators = combineHorizontalSeparators(categorized);
    categorizeGraphics(categorized, region, mergedSeparators);

    Collections.sort(categorized.getHorizontalSeparators(), Sorting.sortByLowerY);
    Collections.sort(categorized.getVerticalSeparators(), Sorting.sortByLowerX);

    if (log.isInfoEnabled()) {
        logGraphics(categorized);
    }
    return categorized;
}
// -------------------------- PUBLIC STATIC METHODS --------------------------
/**
 * Considers the graphic a horizontal separator if it is flat and its aspect ratio is high.
 *
 * @param g the graphic to check
 * @return true if the graphic is no higher than 15 and more than 10 times as wide as it is high
 */
public static boolean canBeConsideredHorizontalSeparator(@NotNull GraphicContent g) {
    // a tall graphic is never a horizontal separator, no matter how wide it is
    final boolean flatEnough = !(g.getPos().height > 15.0f);
    return flatEnough && g.getPos().width / g.getPos().height > 10.0f;
}
/**
 * Decides whether the graphic could be a bar which is part of a mathematical expression.
 * <p>
 * The graphic must be at most 5 high and at least 6 times as wide as it is high. The contents
 * which the region reports as surrounding the graphic are then scanned, and the graphic is
 * accepted as soon as all of the following have been seen (a single content may satisfy more
 * than one of them):
 * <ul>
 * <li>a content whose y is smaller than the end y of the graphic,</li>
 * <li>a content whose end y is bigger than the y of the graphic,</li>
 * <li>a text which either contains math according to {@code Formulas.textContainsMath}, or
 * contains one of the characters in {@code POSSIBLE_MATH_SYMBOLS}.</li>
 * </ul>
 *
 * @param g      the graphic to check
 * @param region the region which is asked for the contents surrounding the graphic
 * @return true if the graphic can be considered a math bar
 */
public static boolean canBeConsideredMathBarInRegion(@NotNull GraphicContent g,
                                                     @NotNull final PhysicalPageRegion region) {
    final boolean tooHigh = g.getPos().height > 5.0f;
    if (tooHigh || g.getPos().width / g.getPos().height < 6.0f) {
        return false;
    }

    boolean sawOver = false;
    boolean sawUnder = false;
    boolean sawMath = false;

    // the second argument is presumably a search distance -- confirm against findSurrounding
    for (PhysicalContent neighbour : region.findSurrounding(g, 10)) {
        sawUnder |= neighbour.getPos().y < g.getPos().endY;
        sawOver |= neighbour.getPos().endY > g.getPos().y;

        if (neighbour.isText()) {
            if (Formulas.textContainsMath(neighbour.getPhysicalText())) {
                sawMath = true;
            } else {
                // fall back to symbols which might be rendered with a normal font
                final String neighbourText = neighbour.getPhysicalText().getText();
                for (char symbol : POSSIBLE_MATH_SYMBOLS.toCharArray()) {
                    if (neighbourText.indexOf(symbol) >= 0) {
                        sawMath = true;
                        break;
                    }
                }
            }
        }

        if (sawOver && sawUnder && sawMath) {
            return true;
        }
    }
    return false;
}
/**
 * Considers the graphic a vertical separator if it is narrow and its aspect ratio is high.
 *
 * @param g the graphic to check
 * @return true if the graphic is no wider than 15 and more than 15 times as high as it is wide
 */
public static boolean canBeConsideredVerticalSeparator(@NotNull GraphicContent g) {
    // a wide graphic is never a vertical separator, no matter how high it is
    final boolean narrowEnough = !(g.getPos().width > 15.0f);
    return narrowEnough && g.getPos().height / g.getPos().width > 15.0f;
}
// -------------------------- STATIC METHODS --------------------------
/**
 * Checks whether the position of the graphic contains at least five of the contents of the
 * region. Note that every kind of content is counted here, not only text.
 *
 * @param region  the region whose contents are checked
 * @param graphic the graphic which might contain them
 * @return true as soon as the fifth contained content is found, false if there are fewer
 */
private static boolean graphicContainsTextFromRegion(@NotNull final PhysicalPageRegion region,
                                                     @NotNull final GraphicContent graphic) {
    int stillNeeded = 5;
    for (PhysicalContent candidate : region.getContents()) {
        if (!graphic.getPos().contains(candidate.getPos())) {
            continue;
        }
        stillNeeded--;
        if (stillNeeded == 0) {
            return true;
        }
    }
    return false;
}
// -------------------------- OTHER METHODS --------------------------
/**
 * Combines horizontal separators which seem to belong together, for example the lines of a
 * table which is drawn with horizontal lines only.
 * <p>
 * Separators are grouped by x position, color and width, where x and width are compared with
 * a tolerance obtained by integer-dividing them by three. Groups with fewer than two members
 * are left alone. The members of every other group are removed from the horizontal separators
 * of {@code ret}, sorted by {@code Sorting.sortByLowerY} and merged with their predecessor as
 * long as the distance between them is at most 50; a bigger distance starts a new combined
 * graphic.
 *
 * @param ret the categorized graphics; its list of horizontal separators is modified
 * @return the combined graphics, grouped in the order in which each group first appeared among
 *         the horizontal separators. They are not added to {@code ret} by this method.
 */
@NotNull
private List<GraphicContent> combineHorizontalSeparators(@NotNull CategorizedGraphics ret) {
    // LinkedHashMap makes the order of the result follow the input instead of hash codes
    final Map<String, List<GraphicContent>> groups
            = new LinkedHashMap<String, List<GraphicContent>>();

    for (GraphicContent hsep : ret.getHorizontalSeparators()) {
        final int xBucket = ((int) hsep.getPos().x) / 3;    // divide by three as rounding
        final int widthBucket = ((int) hsep.getPos().width) / 3;

        // the delimiters keep the three parts of the key from running into each other
        final String key = xBucket + "|" + hsep.getColor() + "|" + widthBucket;

        List<GraphicContent> group = groups.get(key);
        if (group == null) {
            group = new ArrayList<GraphicContent>();
            groups.put(key, group);
        }
        group.add(hsep);
    }

    final List<GraphicContent> combinedGraphics = new ArrayList<GraphicContent>();
    for (List<GraphicContent> sepList : groups.values()) {
        if (sepList.size() < 2) {
            continue;    // nothing to combine with
        }

        Collections.sort(sepList, Sorting.sortByLowerY);
        if (log.isInfoEnabled()) {
            log.info("LOG00970:Combining " + sepList);
        }

        // the parts are replaced by what is returned from this method
        ret.getHorizontalSeparators().removeAll(sepList);

        GraphicContent newlyCombined = sepList.get(0);
        for (int i = 1; i < sepList.size(); i++) {
            final GraphicContent graphicPart = sepList.get(i);
            if (newlyCombined.getPos().distance(graphicPart.getPos()) > 50.0f) {
                // too far away: finish the current combination and start a new one
                combinedGraphics.add(newlyCombined);
                newlyCombined = graphicPart;
            } else {
                newlyCombined = newlyCombined.combineWith(graphicPart);
            }
        }
        combinedGraphics.add(newlyCombined);
    }
    return combinedGraphics;
}
/**
 * Categorizes every graphic in the list and stores it in {@code ret}.
 * <p>
 * Graphics which are considered too big are logged and skipped entirely. Of the rest, a
 * graphic which contains enough of the region's contents becomes a container which can not be
 * assigned; a graphic which can be considered a math bar becomes assignable content with the
 * math bar style; everything else becomes assignable content with the image style. Every
 * graphic which was not skipped is also added to the graphics to render.
 * <p>
 * NOTE: categorization as horizontal separator, vertical separator and character used to
 * follow the math bar check, but is currently disabled, so such graphics end up as images.
 *
 * @param ret    receives the categorized graphics
 * @param region the region whose contents are consulted
 * @param list   the graphics to categorize
 */
private void categorizeGraphics(@NotNull CategorizedGraphics ret,
                                @NotNull PhysicalPageRegion region,
                                @NotNull List<GraphicContent> list) {
    for (GraphicContent graphic : list) {
        if (isTooBigGraphic(graphic)) {
            if (log.isInfoEnabled()) {
                log.info("LOG00501:considered too big " + graphic);
            }
            continue;
        }

        if (graphicContainsTextFromRegion(region, graphic)) {
            graphic.setCanBeAssigned(false);
            graphic.setStyle(Style.GRAPHIC_CONTAINER);
            ret.getContainers().add(graphic);
        } else {
            // math bars and images are handled alike, only the style differs
            final boolean mathBar = canBeConsideredMathBarInRegion(graphic, region);
            graphic.setCanBeAssigned(true);
            if (mathBar) {
                graphic.setStyle(Style.GRAPHIC_MATH_BAR);
            } else {
                graphic.setStyle(Style.GRAPHIC_IMAGE);
            }
            ret.getContents().add(graphic);
        }

        ret.getGraphicsToRender().add(graphic);
    }
}
/**
 * A graphic is too big if its area is at least as big as the area given by the dimensions
 * this segmentator was created with.
 *
 * @param graphic the content to check
 * @return true if the graphic covers at least that area
 */
private boolean isTooBigGraphic(@NotNull final PhysicalContent graphic) {
    final float pageArea = w * h;
    return pageArea <= graphic.getPos().area();
}
/**
 * Writes one info line to the log for every container, horizontal separator, vertical
 * separator and content in the given categorization, in that order. The caller is expected
 * to check that info logging is enabled.
 *
 * @param ret the categorized graphics to log
 */
private void logGraphics(@NotNull CategorizedGraphics ret) {
    for (final GraphicContent container : ret.getContainers()) {
        log.info("LOG00502:considered container: " + container);
    }

    for (final GraphicContent hsep : ret.getHorizontalSeparators()) {
        log.info("LOG00505:considered hsep: " + hsep);
    }

    for (final GraphicContent vsep : ret.getVerticalSeparators()) {
        log.info("LOG00506:considered vsep: " + vsep);
    }

    for (final GraphicContent content : ret.getContents()) {
        log.info("LOG00980:considered content: " + content);
    }
}
}