/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.logical.operation;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.logical.DocumentMetadata;
import org.elacin.pdfextract.logical.Operation;
import org.elacin.pdfextract.style.Style;
import org.elacin.pdfextract.tree.DocumentNode;
import org.elacin.pdfextract.tree.PageNode;
import org.elacin.pdfextract.tree.ParagraphNode;
import java.util.List;
/**
* Created by IntelliJ IDEA. User: elacin Date: 31.05.11 Time: 06.25 To change this template use
* File | Settings | File Templates.
*/
public class ExtractTitle implements Operation {
// ------------------------------ FIELDS ------------------------------
private static final Logger log = Logger.getLogger(ExtractTitle.class);
// ------------------------ INTERFACE METHODS ------------------------
// --------------------- Interface Operation ---------------------
public void doOperation(final DocumentNode root, final DocumentMetadata metadata) {
final List<Style> headerCandidates = metadata.getCandidateHeaderStyles();
/* extract title */
PageNode firstPage = root.getChildren().get(0);
List<ParagraphNode> firstPagePrfs = firstPage.getChildren();
for (int i = 0; i < firstPagePrfs.size(); i++) {
final ParagraphNode titleParagraph = firstPagePrfs.get(i);
if (headerCandidates.contains(titleParagraph.getStyle())) {
/* check if the next text logically belongs with this */
if (i + 1 != firstPagePrfs.size() - 1) {
ParagraphNode peekNext = firstPagePrfs.get(i + 1);
if (peekNext.getStyle().equals(titleParagraph.getStyle())) {
firstPage.removeChild(peekNext);
titleParagraph.addChildren(peekNext.getChildren());
}
}
root.setTitle(titleParagraph);
firstPage.removeChild(titleParagraph);
// headerCandidates.remove(titleParagraph.getStyle()); //TODO:does this make sense?
log.warn("LOG01430:Title is " + root.getTitle());
break;
}
}
}
}