/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.physical;
import org.apache.log4j.Logger;
import org.apache.log4j.MDC;
import org.elacin.pdfextract.Constants;
import org.elacin.pdfextract.content.PhysicalPage;
import org.elacin.pdfextract.content.PhysicalText;
import org.elacin.pdfextract.datasource.DocumentContent;
import org.elacin.pdfextract.datasource.PageContent;
import org.elacin.pdfextract.physical.word.WordSegmentator;
import org.elacin.pdfextract.physical.word.WordSegmentatorImpl;
import org.elacin.pdfextract.tree.DocumentNode;
import org.elacin.pdfextract.tree.PageNode;
import java.util.List;
/**
 * Entry point for the geometric analysis of a document: converts the raw per-page content of a
 * {@link DocumentContent} into a {@link DocumentNode} tree holding one {@link PageNode} for every
 * non-empty page.
 */
public class GeometricAnalysis {

    private static final Logger log = Logger.getLogger(GeometricAnalysis.class);

    /** Segmentator used to combine the characters of a page into words. */
    public static final WordSegmentator wordSegmentator = new WordSegmentatorImpl();

    /**
     * Analyzes every page of the given document content and collects the results under a new
     * document root.
     *
     * <p>The styles of {@code content} are copied to the root. For each page, the characters are
     * segmented into words, a {@link PhysicalPage} is built from those words together with the
     * page's graphics, number and dimensions, and {@link PageSegmentator#analyzePage} divides it
     * into smaller sections. Pages without any characters are logged and skipped, so they produce
     * no child node.
     *
     * <p>While a page is being processed its page number is available to log output under the
     * MDC key {@code "page"}. The key is removed again before this method exits, also when the
     * analysis of a page fails with an exception.
     *
     * @param content the document content to analyze
     * @return the root node, with one child per non-empty page
     */
    public static DocumentNode analyzeDocument(final DocumentContent content) {
        DocumentNode root = new DocumentNode();
        final long t0 = System.currentTimeMillis();

        root.getStyles().addAll(content.getStyles());

        try {
            for (final PageContent inputPage : content.getPages()) {
                MDC.put("page", inputPage.getPageNum());

                if (inputPage.getCharacters().isEmpty()) {
                    log.error("LOG01150:Page " + inputPage.getPageNum() + " is empty");
                    continue;
                }

                final List<PhysicalText> words =
                        wordSegmentator.segmentWords(inputPage.getCharacters());

                /* create a physical page instance */
                PhysicalPage pp = new PhysicalPage(words, inputPage.getGraphics(),
                        inputPage.getPageNum(), inputPage.getDimensions());

                /* divide the page in smaller sections */
                final PageNode pageNode = PageSegmentator.analyzePage(pp);

                /* the physical page is only attached to the node when rendering is enabled */
                if (Constants.RENDER_ENABLED) {
                    pageNode.setPhysicalPage(pp);
                }

                root.addChild(pageNode);
            }
        } finally {
            /* the MDC is bound to the current thread; always clear the key so that a failure on
             * one page does not leave a stale page number in later log output */
            MDC.remove("page");
        }

        final long td = System.currentTimeMillis() - t0;
        log.info("Analyzed " + content.getPages().size() + " pages in " + td + "ms");
        return root;
    }
}