/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.logical; import org.apache.log4j.Logger; import org.elacin.pdfextract.logical.operation.*; import org.elacin.pdfextract.tree.DocumentNode; /** * Created by IntelliJ IDEA. User: elacin Date: 31.05.11 Time: 06.38 To change this template use * File | Settings | File Templates. */ public class LogicalAnalysis { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(LogicalAnalysis.class); // -------------------------- PUBLIC STATIC METHODS -------------------------- public static void analyzeDocument(final DocumentNode root, final boolean arc) { if (root.getWords().isEmpty() || root.getChildren().isEmpty()) { log.warn("LOG01590:tried to analyze empty document"); return; } final DocumentMetadata metadata = new DocumentMetadata(root); new ExtractTitle().doOperation(root, metadata); new RemovePageNumbers().doOperation(root, metadata); if (arc) { new ExtractFootnotes().doOperation(root, metadata); new ExtractAbstractAndRemovePreceedingText().doOperation(root, metadata); } new RecognizeDivs().doOperation(root, metadata); } }