/*
 * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.elacin.pdfextract.logical;

import org.apache.log4j.Logger;
import org.elacin.pdfextract.style.Style;
import org.elacin.pdfextract.tree.*;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class represents knowledge collected while performing the logical analysis of the document
 * so the information can be shared between the different operations.
 *
 * <p>On construction it derives three things from the document tree: a per-style character
 * count, the style with the highest count (treated as body text), and a list of styles that may
 * be used for headings.
 */
public class DocumentMetadata {
// ------------------------------ FIELDS ------------------------------

    private static final Logger log = Logger.getLogger(DocumentMetadata.class);

    private final DocumentNode root;
    private final Style bodyText;
    private final List<Style> candidateHeaderStyles;
    private final Map<Style, Integer> styleCounts;

// --------------------------- CONSTRUCTORS ---------------------------

    /**
     * Analyses the given document and caches the results.
     *
     * @param root the document to analyse
     */
    public DocumentMetadata(final DocumentNode root) {
        this.root = root;

        // Order matters: each step consumes the result of the previous one.
        styleCounts = findStyleCountsFromDocument(root);
        bodyText = findBodyTextFromStyleCounts(styleCounts);
        candidateHeaderStyles = findHeaderCandidates(root, bodyText);

        if (log.isInfoEnabled()) {
            log.info("LOG01410:bodytext is " + bodyText);
            log.info("LOG01480:headerCandidates = " + candidateHeaderStyles);
        }
    }

// -------------------------- STATIC METHODS --------------------------

    /**
     * Checks whether a line is at least as large as the body text.
     *
     * @param bodyText the style considered to be body text
     * @param line     the line to check
     * @return true if the line's style has an xSize greater than or equal to that of bodyText
     */
    private static boolean canBeHeaderStyle(@NotNull final Style bodyText,
                                            @NotNull final LineNode line) {
        final boolean largeEnough = line.getStyle().xSize >= bodyText.xSize;

        // Guarded so the message (and line.toString()) is only built when it will be emitted.
        if (largeEnough && log.isInfoEnabled()) {
            log.info("LOG01450:Line " + line + " can be header style");
        }
        return largeEnough;
    }

    /**
     * Checks whether a line looks like an identifier (for example a numbering) in front of a
     * heading: it must use a different font than the body text, be strictly larger than the body
     * text, and its first whitespace-separated word must be at most three characters long and
     * either start with a digit, contain a '.', or start with one of the characters in
     * "abcdABCI".
     *
     * <p>Lines whose text is empty or consists only of whitespace are never line ids.
     *
     * <p>NOTE(review): in findHeaderCandidates this is only evaluated after canBeHeaderStyle
     * returned false, i.e. when the line is smaller than the body text; for ordinary size values
     * the first guard below then returns false as well, so this heuristic does not appear to
     * contribute any candidates there. Confirm the intended size comparison.
     *
     * @param line     the line to check
     * @param bodyText the style considered to be body text
     * @return true if the line matches the heuristic described above
     */
    private static boolean canBeLineId(@NotNull final LineNode line,
                                       @NotNull final Style bodyText) {
        final Style lineStyle = line.getStyle();

        final boolean fontSame = bodyText.fontName.equals(lineStyle.fontName);
        final boolean smallerThanBodyText = bodyText.xSize >= lineStyle.xSize;
        if (fontSame || smallerThanBodyText) {
            return false;
        }

        final String firstWord = line.getText().trim().split("\\s")[0];

        // An empty first word (blank line text) previously fell through to charAt(0) and
        // threw StringIndexOutOfBoundsException.
        if (firstWord.isEmpty() || firstWord.length() > 3) {
            return false;
        }

        final char firstChar = firstWord.charAt(0);
        if (Character.isDigit(firstChar)
                || firstWord.contains(".")
                || "abcdABCI".indexOf(firstChar) != -1) {
            log.warn("LOG01440:Line " + line + " can be line id");
            return true;
        }
        return false;
    }

    /**
     * Picks the style with the highest count.
     *
     * @param styleCounts count per style
     * @return the style with the largest count, or Style.NO_STYLE if the map is empty. On equal
     *         counts the style encountered first while iterating the map is kept.
     */
    @NotNull
    private static Style findBodyTextFromStyleCounts(final Map<Style, Integer> styleCounts) {
        Style bodyText = Style.NO_STYLE;
        int maxCount = Integer.MIN_VALUE;

        for (Map.Entry<Style, Integer> entry : styleCounts.entrySet()) {
            final int count = entry.getValue();
            if (maxCount < count) {
                maxCount = count;
                bodyText = entry.getKey();
            }
        }
        return bodyText;
    }

    /**
     * Creates a list of possible styles for headings.
     *
     * <p>Walks every line of every paragraph of every page. A line's style is added (once) if it
     * differs from the body text style and the line passes canBeHeaderStyle or canBeLineId.
     *
     * @param root     the document to scan
     * @param bodyText the style considered to be body text; never added to the result
     * @return the candidate styles in the order they were first encountered
     */
    @NotNull
    private static List<Style> findHeaderCandidates(@NotNull final DocumentNode root,
                                                    @NotNull final Style bodyText) {
        final List<Style> headerCandidates = new ArrayList<Style>(root.getStyles().size());

        for (PageNode page : root.getChildren()) {
            for (ParagraphNode paragraph : page.getChildren()) {
                for (LineNode line : paragraph.getChildren()) {
                    final Style lineStyle = line.getStyle();

                    // Skip styles already collected and the body text style itself.
                    if (headerCandidates.contains(lineStyle) || bodyText.equals(lineStyle)) {
                        continue;
                    }

                    if (canBeHeaderStyle(bodyText, line) || canBeLineId(line, bodyText)) {
                        headerCandidates.add(lineStyle);
                    }
                }
            }
        }
        return headerCandidates;
    }

    /**
     * Counts, for every style listed by the document, the total number of characters of all
     * words using that style.
     *
     * @param root the document to scan
     * @return a map from each style in root.getStyles() to its character count; styles without
     *         any words map to 0. Words whose style is not listed by the document are ignored.
     */
    private static Map<Style, Integer> findStyleCountsFromDocument(final DocumentNode root) {
        final Map<Style, Integer> styleCounts =
                new HashMap<Style, Integer>(root.getStyles().size());

        // Seed every known style with zero so unused styles are still present in the result.
        for (int i = 0; i < root.getStyles().size(); i++) {
            styleCounts.put(root.getStyles().get(i), 0);
        }

        for (WordNode word : root.getWords()) {
            final Style style = word.getStyle();

            // All stored values are non-null, so a null lookup means the style is unknown.
            final Integer old = styleCounts.get(style);
            if (old == null) {
                continue;
            }
            styleCounts.put(style, old + word.getText().length());
        }
        return styleCounts;
    }

// --------------------- GETTER / SETTER METHODS ---------------------

    /**
     * @return the style determined to be body text
     */
    public Style getBodyText() {
        return bodyText;
    }

    /**
     * @return the list of candidate header styles; this is the internal list, not a copy
     */
    public List<Style> getCandidateHeaderStyles() {
        return candidateHeaderStyles;
    }

    /**
     * @return the document this metadata was computed from
     */
    public DocumentNode getRoot() {
        return root;
    }

    /**
     * @return the character count per style; this is the internal map, not a copy
     */
    public Map<Style, Integer> getStyleCounts() {
        return styleCounts;
    }
}