/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.logical.operation; import org.apache.log4j.Logger; import org.elacin.pdfextract.logical.DocumentMetadata; import org.elacin.pdfextract.logical.Operation; import org.elacin.pdfextract.style.StyleDifference; import org.elacin.pdfextract.tree.DocumentNode; import org.elacin.pdfextract.tree.PageNode; import org.elacin.pdfextract.tree.ParagraphNode; import java.util.List; import static org.elacin.pdfextract.style.StyleComparator.styleCompare; /** * Recognizes a heading with the name abstract, and the subsequent paragraphs of text until the next * header-like element. * * Also removes all content before this abstract, so it is essential that ExtractTitle be run * before this. 
* */ public class ExtractAbstractAndRemovePreceedingText implements Operation { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(ExtractAbstractAndRemovePreceedingText.class); // ------------------------ INTERFACE METHODS ------------------------ // --------------------- Interface Operation --------------------- public void doOperation(final DocumentNode root, final DocumentMetadata metadata) { if (root.getWords().isEmpty() || root.getChildren().isEmpty()) { throw new RuntimeException("tried to analyze empty document"); } PageNode firstPage = root.getChildren().get(0); List<ParagraphNode> prfs = firstPage.getChildren(); for (int i = 0; i < prfs.size(); i++) { final ParagraphNode absTitlePrf = prfs.get(i); if (absTitlePrf.getText().trim().toLowerCase().equals("abstract") && (i + 1 != prfs.size())) { ParagraphNode abstractPrf = prfs.get(++i); i++; while (true) { if (i == prfs.size()) { break; } ParagraphNode next = prfs.get(i); StyleDifference diff = styleCompare(next.getStyle(), abstractPrf.getStyle()); if (diff != StyleDifference.SAME_STYLE) { break; } abstractPrf.addChildren(next.getChildren()); prfs.remove(i); } /* set the newly created paragraph as the special abstract paragraph in the tree, and remove it from the original position* */ root.setAbstractParagraph(abstractPrf); prfs.remove(abstractPrf); prfs.remove(absTitlePrf); /* then remove all preceeding content */ for (int j = 0; j < i -2; j++){ prfs.remove(0); } if (log.isInfoEnabled()) { String t = abstractPrf.getText(); String text = t.substring(0, Math.min(30, t.length())); log.info("LOG01460:Found abstract with text " + text); } } } } }