/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.logical.operation; import org.apache.log4j.Logger; import org.elacin.pdfextract.logical.DocumentMetadata; import org.elacin.pdfextract.logical.Operation; import org.elacin.pdfextract.style.Style; import org.elacin.pdfextract.tree.*; import org.jetbrains.annotations.NotNull; import java.util.List; /** * Created by IntelliJ IDEA. User: elacin Date: 31.01.11 Time: 10.46 To change this template use * File | Settings | File Templates. */ public class RecognizeDivs implements Operation { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(RecognizeDivs.class); // ------------------------ INTERFACE METHODS ------------------------ // --------------------- Interface Operation --------------------- public void doOperation(@NotNull final DocumentNode root, @NotNull final DocumentMetadata metadata) { List<Style> headerCandidates = metadata.getCandidateHeaderStyles(); Style div1 = null, div2 = null, div3 = null; int divFound = 0; /* identify styles for three levels of divs */ for (PageNode p : root.getChildren()) { for (ParagraphNode prf : p.getChildren()) { Style currentStyle = prf.getStyle(); if (div3 != null) { continue; } if (prf.hasRole()) { continue; } if (!Character.isDigit(prf.getText().charAt(0))) { continue; } if (headerCandidates.contains(currentStyle)) { switch (divFound) { case 0 : div1 = currentStyle; break; case 1 : div2 = currentStyle; break; case 2 : div3 = currentStyle; break; default : assert false; } headerCandidates.remove(currentStyle); divFound++; } } } /* tag matching headline paragraphs with the corresponding role */ for (PageNode p : root.getChildren()) { for (ParagraphNode prf : p.getChildren()) { Style currentStyle = prf.getStyle(); Role r = null; if (!Character.isDigit(prf.getText().charAt(0))) { continue; } if (currentStyle.equals(div1)) { r = Role.DIV1; } else if (currentStyle.equals(div2)) { r = Role.DIV2; } else if (currentStyle.equals(div3)) { r = Role.DIV3; } if (r != null) { prf.addRole(r); } } } } }