/*
* Copyright 2010-2011 Øyvind Berg (elacin@gmail.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.elacin.pdfextract.logical.operation;
import org.apache.log4j.Logger;
import org.elacin.pdfextract.logical.DocumentMetadata;
import org.elacin.pdfextract.logical.Operation;
import org.elacin.pdfextract.style.Style;
import org.elacin.pdfextract.style.TextUtils;
import org.elacin.pdfextract.tree.DocumentNode;
import org.elacin.pdfextract.tree.PageNode;
import org.elacin.pdfextract.tree.ParagraphNode;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.elacin.pdfextract.style.TextUtils.findDominatingStyle;
/**
 * Operation which looks for page numbers near the bottom of each page and removes them from the
 * document tree.
 *
 * <p>A paragraph is a page number candidate when it has no role, has exactly one child, starts in
 * the lower 15% of its page, and its trimmed text is non-empty and consists only of digits. If
 * fewer than half of the pages (and at least one) have a candidate, nothing is removed. Otherwise
 * the dominating style among all candidates is determined, candidates with another style are
 * discarded, and on every page where exactly one candidate remains, that paragraph is removed
 * from its parent.
 */
public class RemovePageNumbers implements Operation {
// ------------------------------ FIELDS ------------------------------

    private static final Logger log = Logger.getLogger(RemovePageNumbers.class);

    /**
     * A candidate's y position must be at least this fraction of the page's endY, i.e. only the
     * lower 15% of the page is searched.
     */
    private static final float PAGE_NUMBER_REGION_START = 0.85f;

// ------------------------ INTERFACE METHODS ------------------------

// --------------------- Interface Operation ---------------------

    /**
     * Finds and removes page numbers from the pages below <code>root</code>.
     *
     * @param root     the document whose pages are searched; matching paragraphs are removed
     *                 from their parent
     * @param metadata not used by this operation
     */
    public void doOperation(final DocumentNode root, final DocumentMetadata metadata) {
        final int pageCount = root.getChildren().size();

        /* candidates grouped per page (only pages which have any), and all of them together */
        final List<List<ParagraphNode>> potPageNumbersForPage
                = new ArrayList<List<ParagraphNode>>(pageCount);
        final List<ParagraphNode> allPotPageNumbers = new ArrayList<ParagraphNode>(2 * pageCount);

        for (PageNode page : root.getChildren()) {
            final List<ParagraphNode> candidates = findCandidates(page);
            if (candidates.isEmpty()) {
                continue;
            }
            potPageNumbersForPage.add(candidates);
            allPotPageNumbers.addAll(candidates);
        }

        if (log.isInfoEnabled()) {
            log.info("LOG01540:potential page numbers :" + potPageNumbersForPage);
        }

        /* require candidates on at least half of the pages, and on at least one page */
        if (potPageNumbersForPage.size() < Math.max(1, pageCount / 2)) {
            if (log.isInfoEnabled()) {
                log.info("LOG01560:Could not find page numbers");
            }
            return;
        }

        final Style mostProbablePageNumStyle = findDominatingStyle(allPotPageNumbers);
        if (log.isInfoEnabled()) {
            log.info("LOG01550:mostProbablePageNumStyle" + mostProbablePageNumStyle);
        }

        for (List<ParagraphNode> pageNumCandidatesForPage : potPageNumbersForPage) {
            retainCandidatesWithStyle(pageNumCandidatesForPage, mostProbablePageNumStyle);

            /* make sure we only remove max one page number per page: skip ambiguous pages */
            if (pageNumCandidatesForPage.isEmpty()) {
                log.warn("LOG01570:No page numbers left after checking style");
                continue;
            }
            if (pageNumCandidatesForPage.size() > 1) {
                log.warn("LOG01570:Found several possible page numbers for a page:"
                        + pageNumCandidatesForPage);
                continue;
            }

            final ParagraphNode pageNumToRemove = pageNumCandidatesForPage.get(0);
            log.warn("LOG01580:Removing page number " + pageNumToRemove);
            pageNumToRemove.getParent().removeChild(pageNumToRemove);
        }
    }

// -------------------------- STATIC METHODS --------------------------

    /**
     * Collects the paragraphs of <code>page</code> which could be page numbers.
     *
     * @param page the page to search
     * @return a new, possibly empty, mutable list of candidates
     */
    private static List<ParagraphNode> findCandidates(final PageNode page) {
        final List<ParagraphNode> candidates = new ArrayList<ParagraphNode>();
        for (ParagraphNode paragraph : page.getChildren()) {
            if (isPageNumberCandidate(page, paragraph)) {
                candidates.add(paragraph);
            }
        }
        return candidates;
    }

    /**
     * Decides whether <code>paragraph</code> could be the page number of <code>page</code>.
     *
     * @param page      the page which contains the paragraph
     * @param paragraph the paragraph to check
     * @return true if the paragraph has no role, has exactly one child, starts in the lower 15%
     *         of the page and has a non-empty, all-digit trimmed text
     */
    private static boolean isPageNumberCandidate(final PageNode page,
                                                 final ParagraphNode paragraph) {
        if (paragraph.hasRole()) {
            return false;
        }

        /* look for one word paragraphs */
        if (paragraph.getChildren().size() != 1) {
            return false;
        }

        /* only look for page numbers in the lower 15% of the page */
        if (paragraph.getPos().y < page.getPos().endY * PAGE_NUMBER_REGION_START) {
            return false;
        }

        return isAllDigit(paragraph.getText().trim());
    }

    /**
     * Removes, in place, every candidate whose style is not equal to <code>style</code>.
     *
     * @param candidates the list to filter; it is modified
     * @param style      the style a candidate must have to be kept
     */
    private static void retainCandidatesWithStyle(final List<ParagraphNode> candidates,
                                                  final Style style) {
        for (Iterator<ParagraphNode> iterator = candidates.iterator(); iterator.hasNext(); ) {
            final ParagraphNode candidate = iterator.next();
            if (!candidate.getStyle().equals(style)) {
                iterator.remove();
            }
        }
    }

    /**
     * Checks whether <code>text</code> is a non-empty sequence of digits.
     *
     * @param text the text to check
     * @return true if text contains at least one character and every character is a digit as
     *         defined by {@link Character#isDigit(char)}; false for the empty string
     */
    private static boolean isAllDigit(final String text) {
        /* an empty text is not a number; without this check the loop below would accept it */
        if (text.length() == 0) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isDigit(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}