/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical.paragraph;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.physical.ParagraphNumberer;
import org.elacin.pdfextract.style.Style;
import org.elacin.pdfextract.style.StyleComparator;
import org.elacin.pdfextract.tree.LineNode;
import org.elacin.pdfextract.tree.ParagraphNode;
import org.elacin.pdfextract.tree.WordNode;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
import static org.elacin.pdfextract.Constants.SPLIT_PARAGRAPHS_BY_STYLES;
/**
 * Groups a sequence of {@link LineNode}s into {@link ParagraphNode}s.
 *
 * <p>Consecutive lines are compared by their dominating style and by the vertical gap between
 * them (measured against a configured median vertical spacing). A new paragraph is started
 * whenever the style comparison and/or the gap indicates a break. When
 * {@code SPLIT_PARAGRAPHS_BY_STYLES} is disabled, all lines end up in a single paragraph.
 *
 * <p>{@link #setMedianVerticalSpacing(int)} must be called before segmenting.
 */
public class ParagraphSegmentator {
// ------------------------------ FIELDS ------------------------------
    private static final Logger log = Logger.getLogger(ParagraphSegmentator.class);

    /** Sentinel stored in {@link #medianVerticalSpacing} until the setter has been called. */
    private static final float SPACING_NOT_SET = -1.0f;

    /** Gap threshold (as a multiple of the median spacing) when a line matches the current style. */
    private static final float MATCHING_STYLE_GAP_FACTOR = 1.5f;

    /** Gap threshold (as a multiple of the median spacing) for a subtly different style. */
    private static final float SUBTLE_DIFFERENCE_GAP_FACTOR = 1.1f;

    private float medianVerticalSpacing = SPACING_NOT_SET;

// --------------------- GETTER / SETTER METHODS ---------------------

    /**
     * Sets the median vertical spacing used as the reference for gap-based splitting.
     *
     * <p>Note that passing {@code -1} is indistinguishable from never having called this method,
     * since that value is used internally as the "not set" marker.
     *
     * @param medianVerticalSpacing the median spacing; stored internally as a {@code float}
     */
    public void setMedianVerticalSpacing(final int medianVerticalSpacing) {
        this.medianVerticalSpacing = medianVerticalSpacing;
    }

// -------------------------- PUBLIC METHODS --------------------------

    /**
     * Splits the given lines into paragraphs.
     *
     * <p>A fresh paragraph id is requested from {@code numberer} for the first paragraph and
     * again every time a split is decided. Empty paragraphs are never added to the result.
     *
     * @param lines    the lines to segment, in the order they should be processed
     * @param numberer source of paragraph ids; advanced once per started paragraph
     * @return the paragraphs found; empty if {@code lines} is empty
     * @throws IllegalStateException if the median vertical spacing has not been set, or if the
     *                               style comparison yields an unknown result
     */
    @NotNull
    public List<ParagraphNode> segmentParagraphsByStyleAndDistance(@NotNull final List<LineNode> lines,
                                                                   final ParagraphNumberer numberer) {
        if (medianVerticalSpacing == SPACING_NOT_SET) {
            throw new IllegalStateException("set medianVerticalSpacing!");
        }

        final List<ParagraphNode> ret = new ArrayList<ParagraphNode>();
        if (lines.isEmpty()) {
            return ret;
        }

        numberer.newParagraph();
        ParagraphNode currentParagraph = new ParagraphNode(numberer.getParagraphId(false));

        if (SPLIT_PARAGRAPHS_BY_STYLES) {
            /* separate the lines by their dominant style into paragraphs */
            Style currentStyle = null;
            LineNode lastLine = null;

            for (LineNode line : lines) {
                final Style lineStyle = line.findDominatingStyle();

                /* until a reference style is known, the line is compared against itself */
                if (currentStyle == null) {
                    currentStyle = lineStyle;
                    lastLine = line;
                }

                final float distance = line.getPos().y - lastLine.getPos().endY;

                if (shouldSplit(currentStyle, lineStyle, line, distance)) {
                    if (!currentParagraph.getChildren().isEmpty()) {
                        if (log.isDebugEnabled()) {
                            log.debug(
                                String.format(
                                    "LOG00660:Split/style: y:%s, "
                                    + "medianVerticalSpacing: %f, distance: %s, style: %s, %s, line: %s",
                                    line.getPos().y, medianVerticalSpacing, distance, currentStyle,
                                    lineStyle, line));
                        }
                        ret.add(currentParagraph);
                    }

                    /* a new paragraph (and id) is started even if the previous one was empty */
                    numberer.newParagraph();
                    currentParagraph = new ParagraphNode(numberer.getParagraphId(false));
                    currentStyle = lineStyle;
                }

                currentParagraph.addChild(line);
                lastLine = line;
            }
        } else {
            for (LineNode line : lines) {
                currentParagraph.addChild(line);
            }
        }

        if (!currentParagraph.getChildren().isEmpty()) {
            ret.add(currentParagraph);
        }
        return ret;
    }

// -------------------------- OTHER METHODS --------------------------

    /**
     * Decides whether {@code line} should start a new paragraph, based on how its dominating
     * style compares to the current paragraph style and on the gap to the previous line.
     *
     * @param currentStyle style of the paragraph currently being built
     * @param lineStyle    dominating style of {@code line}
     * @param line         the line under consideration
     * @param distance     this line's {@code y} minus the previous line's {@code endY}
     * @return {@code true} if a new paragraph should begin with {@code line}
     */
    private boolean shouldSplit(final Style currentStyle,
                                final Style lineStyle,
                                final LineNode line,
                                final float distance) {
        switch (StyleComparator.styleCompare(currentStyle, lineStyle)) {
            case SPLIT :
                return true;

            case SAME_STYLE_AND_BIG_TEXT :
                /* never split here; a gap-based rule (2.5 x median) existed but is disabled */
                return false;

            case SAME_STYLE :
                /* similar styles: only split if there seems to be much space between the lines */
                return distance > medianVerticalSpacing * MATCHING_STYLE_GAP_FACTOR;

            case SUBTLE_DIFFERENCE :
                /* if there is a word with the same style, treat as same */
                if (containsWordWithStyle(line, currentStyle)) {
                    return distance > medianVerticalSpacing * MATCHING_STYLE_GAP_FACTOR;
                }
                /* otherwise split already when there seems to be some space between the lines */
                return distance > medianVerticalSpacing * SUBTLE_DIFFERENCE_GAP_FACTOR;

            case BIG_DIFFERENCE :
                /* keep the line in the paragraph only if one of its words shares the style */
                return !containsWordWithStyle(line, currentStyle);

            default :
                throw new IllegalStateException("made compiler happy :)");
        }
    }

    /**
     * Checks whether any word of {@code line} has a style equal to {@code style}.
     *
     * @param line  the line whose words are inspected
     * @param style the style to look for
     * @return {@code true} as soon as a matching word is found, {@code false} otherwise
     */
    private static boolean containsWordWithStyle(final LineNode line, final Style style) {
        for (WordNode word : line.getChildren()) {
            if (word.getStyle().equals(style)) {
                return true;
            }
        }
        return false;
    }
}