/*
 * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.elacin.pdfextract.logical.operation;

import org.apache.log4j.Logger;
import org.elacin.pdfextract.logical.DocumentMetadata;
import org.elacin.pdfextract.logical.Operation;
import org.elacin.pdfextract.style.Style;
import org.elacin.pdfextract.tree.DocumentNode;
import org.elacin.pdfextract.tree.Role;
import org.elacin.pdfextract.tree.WordNode;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by IntelliJ IDEA. User: elacin Date: Mar 23, 2010 Time: 3:11:50 AM To change this
 * template use File | Settings | File Templates.
*/ public class RecognizeRoles implements Operation { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(RecognizeRoles.class); /* these are used to recognize identifiers */ static final Pattern id = Pattern.compile("(?:X\\d{1,2}|\\w{1,2})"); static final Pattern refWithDotPattern = Pattern.compile("\\s*(" + id + "\\s*\\.\\s*\\d?).*", Pattern.DOTALL | Pattern.MULTILINE); static final Pattern numInParenthesisPattern = Pattern.compile("(\\(\\s*" + id + "\\s*\\)).*", Pattern.DOTALL | Pattern.MULTILINE); @Nullable final Style breadtext = null; // ------------------------ INTERFACE METHODS ------------------------ // --------------------- Interface Operation --------------------- public void doOperation(@NotNull final DocumentNode root, final DocumentMetadata metadata) { if (breadtext == null) { log.error("provide breadtext here"); return; } for (WordNode word : root.getWords()) { checkForIdentifier(word); checkForTopNote(word); checkForPageNumber(word); } } // -------------------------- OTHER METHODS -------------------------- void checkForIdentifier(@NotNull final WordNode word) { String mark = null; final String trimmedText = word.text.trim(); if ("".equals(trimmedText)) { return; } // TODO:! 
if (word.getStyle().equals(breadtext)) { return; } final Matcher matcher = numInParenthesisPattern.matcher(word.text); if (matcher.matches()) { mark = matcher.group(1); } else { final Matcher matcher2 = refWithDotPattern.matcher(word.text); if (matcher2.matches()) { mark = matcher2.group(1); } } /* if the first character is '*' or '-' set that as mark */ // final String firstChar = trimmedText.substring(0, 1); // if ("*-".contains(firstChar)) { // mark = firstChar; // } if (mark != null) { word.addRole(Role.IDENTIFIER); } } private void checkForPageNumber(@NotNull final WordNode word) { boolean isNumber = true; if (((word.text.length() < 5) && word.hasRole(Role.FOOTNOTE)) || word.hasRole(Role.HEADNOTE)) { for (int i = 0; i < word.text.length(); i++) { if (!Character.isDigit(word.text.charAt(i))) { isNumber = false; break; } } if (isNumber) { word.addRole(Role.PAGENUMBER); } } } private void checkForTopNote(@NotNull final WordNode word) { if (word.getPos().y < (word.getPage().getPos().height * 5.0f / 100)) { /* then check the font. we either want smaller than breadtext, or same size but different type */ if ((word.getStyle().ySize < breadtext.ySize) || ((word.getStyle().ySize == breadtext.ySize) &&!word.getStyle().fontName.equals(breadtext.fontName))) { word.addRole(Role.HEADNOTE); } } } }