/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical.line;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.content.PhysicalContent;
import org.elacin.pdfextract.content.PhysicalText;
import org.elacin.pdfextract.geom.Rectangle;
import org.elacin.pdfextract.geom.RectangleCollection;
import org.elacin.pdfextract.geom.Sorting;
import org.elacin.pdfextract.tree.LineNode;
import org.elacin.pdfextract.tree.WordNode;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.elacin.pdfextract.content.AssignablePhysicalContent.BLOCK_NOT_ASSIGNED;
/**
 * Splits the content of one block into lines by building a per-row coverage histogram of the
 * block, locating the (nearly) empty rows between rows containing content, and grouping the
 * content which falls between two consecutive boundaries into a {@link LineNode}.
 */
public class LineSegmentator {
// ------------------------------ FIELDS ------------------------------

    /**
     * Highest coverage value a row may have while still being considered empty. Rows with a
     * coverage strictly greater than this are considered to contain content.
     */
    public static final int LIMIT = 1;

    /**
     * Size of the window used when confirming a boundary: for an empty row {@code i}, the rows
     * {@code i + 1 .. i + LOOKAHEAD - 1} must all contain content for a boundary to be placed.
     */
    public static final int LOOKAHEAD = 2;

    /**
     * Block number written to content once it has been placed on a line. The block number field
     * is only reused here as an "already handled" marker.
     */
    private static final int ASSIGNED_TO_LINE = 1;

    private static final Logger log = Logger.getLogger(LineSegmentator.class);

// -------------------------- PUBLIC STATIC METHODS --------------------------

    /**
     * Divides the content of the given block into lines.
     * <p>
     * Side effects: the block number of every content item is first reset to
     * {@code BLOCK_NOT_ASSIGNED} and then set for every item which ends up on a line, and the
     * list returned by {@code block.getContents()} is sorted in place with
     * {@link Sorting#sortByLowerY}.
     *
     * @param block the block whose contents should be segmented
     * @return the lines found, in the order of the boundaries they were found between; empty if
     *         the block has no content
     */
    @NotNull
    public static List<LineNode> createLinesFromBlocks(RectangleCollection block) {
        final List<LineNode> lines = new ArrayList<LineNode>();

        /* nothing to segment. Bail out explicitly instead of relying on integer overflow when
         *  computing the block height from the untouched MAX_VALUE/MIN_VALUE sentinels */
        if (block.getContents().isEmpty()) {
            return lines;
        }

        int minY = Integer.MAX_VALUE;
        int maxY = Integer.MIN_VALUE;

        /* reuse the assignment numbers: clear them, and find the vertical extent of the block */
        for (PhysicalContent content : block.getContents()) {
            content.getAssignable().setBlockNum(BLOCK_NOT_ASSIGNED);
            minY = Math.min((int) content.getPos().y, minY);
            maxY = Math.max((int) content.getPos().endY, maxY);
        }

        maxY++;    // account for rounding

        final int blockHeight = maxY - minY;

        /* counts[row] is the summed (integer) width of all assignable content covering that row,
         *  where row 0 corresponds to minY */
        final int[] counts = new int[blockHeight];

        for (PhysicalContent content : block.getContents()) {
            if (!content.isAssignable()) {
                continue;
            }

            final int contentHeight = (int) content.getPos().height;
            final int contentStart = (int) content.getPos().y;
            final int contentWidth = (int) content.getPos().width;

            for (int contentY = 0; contentY < contentHeight; contentY++) {
                counts[contentStart + contentY - minY] += contentWidth;
            }
        }

        final List<Integer> lineBoundaries = findLineBoundaries(counts);

        Collections.sort(block.getContents(), Sorting.sortByLowerY);

        LineNode currentLine = new LineNode();

        /* every pair of consecutive boundaries delimits one candidate line */
        for (int i = 0; i < lineBoundaries.size() - 1; i++) {
            final int start = minY + lineBoundaries.get(i) - 1;
            final int stop = minY + lineBoundaries.get(i + 1);

            for (PhysicalContent content : block.getContents()) {
                final Rectangle contentPos = content.getPos();

                /* content which was already placed on a line is never moved */
                if (content.getAssignable().isAssignedBlock()) {
                    continue;
                }

                /* the content must fit between the two boundaries, with one unit of slack on
                 *  either side */
                if ((contentPos.y > start - 1) && (contentPos.endY < stop + 1)) {
                    content.getAssignable().setBlockNum(ASSIGNED_TO_LINE);

                    if (content.isText()) {
                        currentLine.addChild(createWordNode(content.getPhysicalText()));
                    } else {
                        currentLine.addChild(createWordNodeFromGraphic(content));
                    }
                }
            }

            /* only keep lines which actually received content; an empty line object is reused
             *  for the next pair of boundaries */
            if (!currentLine.getChildren().isEmpty()) {
                lines.add(currentLine);
                currentLine = new LineNode();
            }
        }

        combineLines(lines);

        return lines;
    }

    /**
     * Creates a word node from a piece of physical text, carrying over its position, style, text
     * and character spacing.
     *
     * @param text the text to convert
     * @return a new word node
     */
    @NotNull
    public static WordNode createWordNode(@NotNull final PhysicalText text) {
        return new WordNode(text.getPos(), text.getStyle(), text.text, text.charSpacing);
    }

    /**
     * Creates a placeholder word node for graphical content. The node gets the position of the
     * content and the style of its graphic; the id of that style is passed where a text node
     * would pass its text, and -1 where it would pass its character spacing.
     *
     * @param content content for which {@code getGraphicContent()} is available
     * @return a new word node
     */
    @NotNull
    public static WordNode createWordNodeFromGraphic(final PhysicalContent content) {
        return new WordNode(content.getPos(), content.getGraphicContent().getStyle(),
                content.getGraphicContent().getStyle().id, -1);
    }

// -------------------------- STATIC METHODS --------------------------

    /**
     * Sometimes very small punctuation like full stops and commas seems to be left on a line of
     * its own. Work around that here by merging such a line into the line before it.
     * <p>
     * A line is merged into its predecessor only if it has fewer than four children, has the same
     * style as the predecessor, is at most 0.7 times as high as the predecessor, and is at most 2
     * units away from it vertically.
     *
     * @param lines the lines to process; modified in place
     */
    private static void combineLines(final List<LineNode> lines) {
        for (int i = 1; i < lines.size(); i++) {
            final LineNode lastLine = lines.get(i - 1);
            final LineNode currentLine = lines.get(i);

            if (currentLine.getChildren().size() >= 4) {
                continue;
            }

            if (!currentLine.getStyle().equals(lastLine.getStyle())) {
                continue;
            }

            if (currentLine.getPos().height > 0.7f * lastLine.getPos().height) {
                continue;
            }

            if (lastLine.getPos().getVerticalDistanceTo(currentLine.getPos()) > 2) {
                continue;
            }

            if (log.isDebugEnabled()) {
                log.debug("LOG01360:Combining line " + currentLine);
            }

            lastLine.addChildren(currentLine.getChildren());

            /* remove by index: remove(Object) would search using equals() and could drop a
             *  different, equal line */
            lines.remove(i);

            /* stay on the same index, so that the line which moved into this slot is compared
             *  against the same predecessor */
            i--;
        }
    }

    /**
     * Finds the row indices at which the block should be cut into lines.
     * <p>
     * The result always starts with 0 and ends with {@code counts.length}. In between, a boundary
     * {@code i + 1} is added for every empty row {@code i} (coverage at most {@link #LIMIT}) which
     * is seen after content has been encountered, provided that all rows in the lookahead window
     * after it contain content. In effect the cut is placed after the last empty row of a gap,
     * right before content resumes. A window which extends past the end of the array is only
     * checked for the rows which exist, so the final boundary may occur twice.
     *
     * @param counts coverage per row
     * @return row indices delimiting the lines, in ascending order
     */
    @NotNull
    private static List<Integer> findLineBoundaries(@NotNull int[] counts) {
        final List<Integer> lineBoundaries = new ArrayList<Integer>();

        lineBoundaries.add(0);

        boolean hasFoundText = false;

        for (int i = 0; i < counts.length; i++) {
            /* emptiness is decided by LIMIT, the same threshold used everywhere else; LOOKAHEAD
             *  is only the size of the window checked below */
            if (hasFoundText && (counts[i] <= LIMIT)) {
                boolean isBoundary = true;

                /* another empty row inside the window means we are still in the middle of a
                 *  gap, so hold off */
                for (int j = i + 1; (j < i + LOOKAHEAD) && (j < counts.length); j++) {
                    if (counts[j] <= LIMIT) {
                        isBoundary = false;

                        break;
                    }
                }

                if (isBoundary) {
                    lineBoundaries.add(i + 1);
                    hasFoundText = false;
                }
            } else if (counts[i] > LIMIT) {
                hasFoundText = true;
            }
        }

        /* add the end as well */
        lineBoundaries.add(counts.length);

        return lineBoundaries;
    }
}