/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.content.PhysicalContent;
import org.elacin.pdfextract.content.PhysicalPageRegion;
import org.elacin.pdfextract.formula.Formulas;
import org.elacin.pdfextract.geom.Rectangle;
import org.elacin.pdfextract.geom.RectangleCollection;
import org.elacin.pdfextract.style.TextUtils;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Groups the contents of a {@link PhysicalPageRegion} into blocks.
 *
 * <p>For a graphical region all graphic and text contents end up in one single block. For any
 * other region a preliminary pass first combines content that {@link Formulas} considers to be a
 * formula, and afterwards the region is scanned row by row, starting a new block at every piece
 * of assignable content that has not been assigned to a block yet and collecting everything
 * reachable from it.
 *
 * <p>The block number written to each content is the index of its block in the list returned by
 * {@link #findBlocksOfContent()}.
 */
public class ContentGrouper {

    // ------------------------------ FIELDS ------------------------------

    private static final Logger log = Logger.getLogger(ContentGrouper.class);

    /**
     * Content whose left edge is closer than this to the left edge of the region is not
     * considered indented, and is therefore never treated as the start of a formula.
     */
    private static final int MIN_FORMULA_INDENT = 20;

    /** All blocks created so far; this very list is returned by findBlocksOfContent(). */
    @NotNull
    final List<RectangleCollection> allBlocks = new ArrayList<RectangleCollection>(30);

    /** The block currently being filled; replaced by a fresh one whenever a block is finished. */
    @NotNull
    RectangleCollection currentBlock = newBlock();

    /** The region whose contents are grouped. */
    @NotNull
    final PhysicalPageRegion region;

    /** Position of the region, read from {@code region.getPos()} at construction time. */
    @NotNull
    public final Rectangle rpos;

    // --------------------------- CONSTRUCTORS ---------------------------

    /**
     * @param region the region whose contents should be divided into blocks
     */
    public ContentGrouper(@NotNull PhysicalPageRegion region) {
        this.region = region;
        rpos = region.getPos();
    }

    // -------------------------- PUBLIC METHODS --------------------------

    /**
     * Divides the contents of the region into blocks and writes the resulting block number to
     * every content which was placed in one.
     *
     * @return the internal list of blocks (not a copy)
     */
    public List<RectangleCollection> findBlocksOfContent() {
        /* if this is contained in a graphic, just output the lines as one single block */
        if (region.isGraphicalRegion()) {
            for (PhysicalContent content : region.getContents()) {
                if (content.isGraphic() || content.isText()) {
                    currentBlock.addContent(content);
                    content.getAssignable().setBlockNum(allBlocks.size());
                }
            }
            allBlocks.add(currentBlock);
            return allBlocks;
        }

        /* do a preliminary formula block combining */
        createBlocksForFormulas();

        /*
         * Then use the whitespace added to the region to determine blocks of text: follow the
         * trails left between the whitespace and construct blocks from that.
         */
        for (float y = rpos.y; y < rpos.endY; y++) {
            /* iterate through the line to find possible start of blocks */
            for (PhysicalContent contentInRow : region.findContentAtYIndex(y)) {
                if (!contentInRow.isAssignable()
                        || contentInRow.getAssignable().isAssignedBlock()) {
                    continue;
                }

                /* find all connected content from this */
                markEverythingConnectedFrom(contentInRow);

                /*
                 * The start content might have been rejected (a separator graphic), in which
                 * case nothing was collected. Do not emit an empty block for it; the block
                 * numbering stays consistent because nothing was given this number.
                 */
                if (!currentBlock.getContents().isEmpty()) {
                    allBlocks.add(currentBlock);
                    currentBlock = newBlock();
                }
            }
        }

        if (!currentBlock.getContents().isEmpty()) {
            allBlocks.add(currentBlock);
        }
        return allBlocks;
    }

    // -------------------------- OTHER METHODS --------------------------

    /** Creates a new, empty block. */
    @NotNull
    private static RectangleCollection newBlock() {
        return new RectangleCollection(new ArrayList<PhysicalContent>(), null);
    }

    /**
     * Adds the given content to the current block and then recursively does the same for the
     * content found next to it, in every row and column the content covers inside the region.
     *
     * @param content where to start (or continue) the search
     * @return true if the search should carry on past this content; false if the content is not
     *         assignable, already belongs to a block, or is a graphic. A graphic which is not a
     *         separator is still added to the current block before false is returned.
     */
    @SuppressWarnings({ "NumericCastThatLosesPrecision" })
    private boolean markEverythingConnectedFrom(@NotNull final PhysicalContent content) {
        if (!content.isAssignable() || content.getAssignable().isAssignedBlock()) {
            return false;
        }

        /* separators are never made part of a block, and the search stops at them */
        if (content.isGraphic() && content.getGraphicContent().isSeparator()) {
            return false;
        }

        content.getAssignable().setBlockNum(allBlocks.size());
        currentBlock.addContent(content);

        /* other graphics are included, but the search does not continue through them */
        if (content.isGraphic()) {
            return false;
        }

        /* try searching for content in all directions: first along every row... */
        final int startY = (int) Math.max(rpos.y, content.getPos().y);
        final int endY = (int) Math.min(rpos.endY, content.getPos().endY);
        for (int y = startY + 1; y < endY; y++) {
            markBothWaysFromCurrent(content, region.findContentAtYIndex(y));
        }

        /* ... and then along every column */
        final int startX = 1 + (int) Math.max(rpos.x, content.getPos().x);
        final int endX = -1 + (int) Math.min(rpos.endX, content.getPos().endX);
        for (int x = startX; x < endX - 1; x++) {
            markBothWaysFromCurrent(content, region.findContentAtXIndex(x));
        }
        return true;
    }

    /**
     * Preliminary pass which combines content that seems to make up a formula into blocks of
     * its own, before the ordinary block detection is run.
     *
     * <p>The region is scanned row by row. Assignable content is accumulated until a row without
     * styled text is met, which resets the accumulated state. Only indented content is
     * considered; once non-indented content has been seen, the rows up to the next row without
     * styled text are skipped.
     */
    private void createBlocksForFormulas() {
        final Set<PhysicalContent> workingSet = new HashSet<PhysicalContent>();
        boolean skip = false;
        boolean hasSkipped = false;
        float minX = Float.MAX_VALUE;

        /* NEGATIVE_INFINITY, not Float.MIN_VALUE: the latter is the smallest positive float */
        float endY = Float.NEGATIVE_INFINITY;

        for (float y = rpos.y; y < rpos.endY; y++) {
            List<PhysicalContent> row = region.findContentAtYIndex(y);

            if (!TextUtils.listContainsStyledText(row)) {
                /* start afresh, including the lower bound so that it cannot go stale */
                workingSet.clear();
                skip = false;
                minX = Float.MAX_VALUE;
                endY = Float.NEGATIVE_INFINITY;
                continue;
            }

            if (skip) {
                continue;
            }

            for (PhysicalContent content : row) {
                if (content.isAssignable() && !workingSet.contains(content)) {
                    minX = Math.min(content.getPos().x, minX);
                    endY = Math.max(content.getPos().endY, endY);
                    workingSet.add(content);
                }
            }

            /* only detect indented formulas */
            if (minX < region.getPos().x + MIN_FORMULA_INDENT) {
                skip = true;
                hasSkipped = true;
                continue;
            }

            if (!Formulas.textSeemsToBeFormula(workingSet)) {
                continue;
            }

            /*
             * We found a formula: do hungry block combining of all continuous content, moving
             * downwards until we are past the lower edge of everything collected so far. Note
             * that this advances the row counter of the enclosing loop.
             */
            while (y <= endY + 1) {
                for (PhysicalContent content : row) {
                    if (content.isAssignable() && !workingSet.contains(content)) {
                        workingSet.add(content);
                        endY = Math.max(content.getPos().endY, endY);
                    }
                }
                y++;

                /* fetch the next row rather than modifying the list handed out by the region */
                row = region.findContentAtYIndex(y);
            }

            /*
             * If there was no non-formula text in between, this is combined with the last
             * block. Decide that before numbering, so that the contents get the index of the
             * block they actually end up in.
             */
            final boolean mergeWithLast = !hasSkipped && !allBlocks.isEmpty();
            final int blockNum = mergeWithLast ? allBlocks.size() - 1 : allBlocks.size();

            for (PhysicalContent content : workingSet) {
                if (!content.getAssignable().isAssignedBlock()) {
                    content.getAssignable().setBlockNum(blockNum);
                    currentBlock.addContent(content);
                }
            }

            if (mergeWithLast) {
                allBlocks.get(blockNum).addContents(currentBlock.getContents());
            } else {
                allBlocks.add(currentBlock);
            }

            printLastBlock();
            currentBlock = newBlock();
            hasSkipped = false;
        }
    }

    /**
     * Continues the search from the neighbours of current in the given row or column, first
     * towards the start of the list and then towards the end of it. Each direction is followed
     * until markEverythingConnectedFrom() returns false.
     *
     * @param current the content to search outwards from
     * @param line    the contents of one row or column
     */
    private void markBothWaysFromCurrent(final PhysicalContent current,
                                         @NotNull final List<PhysicalContent> line) {
        final int currentIndex = line.indexOf(current);

        /*
         * Without a position there are no neighbours. Left unchecked, the -1 would make the
         * second loop walk the line from its very start.
         */
        if (currentIndex < 0) {
            return;
        }

        /* left / up */
        for (int index = currentIndex - 1; index >= 0; index--) {
            if (!markEverythingConnectedFrom(line.get(index))) {
                break;
            }
        }

        /* right / down */
        for (int index = currentIndex + 1; index < line.size(); index++) {
            if (!markEverythingConnectedFrom(line.get(index))) {
                break;
            }
        }
    }

    /**
     * Logs the concatenated text of the last block in allBlocks at info level. Does nothing if
     * info logging is disabled or no block exists.
     */
    private void printLastBlock() {
        if (!log.isInfoEnabled() || allBlocks.isEmpty()) {
            return;
        }

        final StringBuilder sb = new StringBuilder();
        for (PhysicalContent content : allBlocks.get(allBlocks.size() - 1).getContents()) {
            if (content.isText()) {
                sb.append(content.getPhysicalText().getText());
            }
        }
        log.info("LOG01370:Created block" + sb);
    }
}