/* * Copyright (c) 2013 Allogy Interactive. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.allogy.app.media; import java.util.ArrayList; import com.hsl.txtreader.DocFile; /** * @author Corey Cowart */ public class PDFBook extends EBook { private int numPages; private int bookLength; private ArrayList<String> TabOfConts; private ArrayList<String> Sections; public PDFBook (String fileName) { //the DocFile class allows quick access to data such as the number of pages in a document DocFile docfile = new DocFile(fileName); numPages = docfile.getNumPages(); TabOfConts = new ArrayList<String>(); Sections = new ArrayList<String>(); bookLength = 0; /*First, this loop populates the table of contents with the corresponding page numbers * since the current code cannot assess the meta data of the pdf file to determine inherent * section titles * * Second, it extracts the text from each page individually and populates the bookContents * array list with all of the actual text from the pdf file * * Finally, it accumulates the length of each section to get an overall book length. */ for (int i = 1; i <= numPages; i++){ String currentSection = "Page " + i; TabOfConts.add(currentSection); StringBuffer content = docfile.getPageContent(i); String noTags = removeTags(content); Sections.add(noTags); bookLength += noTags.length(); } } /* *Since the PDF library being used translates the PDFs into html rather than just stripping the plain *text out of them, this method is used to remove the html tags and introduce line breaks where needed. */ public static String removeTags (StringBuffer mContentStringBuffer){ String originalString = mContentStringBuffer.toString(); String newString = ""; String currentTag = ""; for (int i =0; i < originalString.length(); i++){ String currentChar = ""+originalString.charAt(i); if (originalString.charAt(i) == '<'){ i++; while(originalString.charAt(i) != '>'){ String currentTagChar = ""+originalString.charAt(i); currentTag = currentTag+currentTagChar; i++; } if (currentTag.equals("br")){ newString = newString+"\n"; } currentTag = ""; } else{ String currentRealChar = ""+originalString.charAt(i); newString = newString+currentRealChar; } } newString += "\n\n"; return newString; } /* * Returns the Array list that contains all of the pdf's text */ public ArrayList<String> getSections() { return Sections; } /* * Returns the pdf's table of contents */ protected ArrayList<String> getTabOfConts() { return TabOfConts; } }