/*
* DrakkarKeel - An Enterprise Collaborative Search Platform
*
* The contents of this file are subject under the terms described in the
* DRAKKARKEEL_LICENSE file included in this distribution; you may not use this
* file except in compliance with the License.
*
* 2013-2014 DrakkarKeel Platform.
*/
package drakkar.mast.retrieval.parser;
import drakkar.oar.util.OutputMonitor;
import drakkar.mast.retrieval.DocumentLucene;
import drakkar.mast.retrieval.DocumentMinion;
import com.sun.labs.minion.SimpleIndexer;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.Calendar;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
/**
* Extracts from a PDF document: content, title, author, summary, number of
* pages, creation date and modification date.
* <p>
* Written against PDFBox 1.0.0. Note: Terrier uses PDFBox 0.6.7a.
*/
public class PdfParser {

    /** Number of pages grouped into one indexed fragment when a large PDF is split. */
    private static final int PAGES_PER_CHUNK = 100;

    // Working state of the most recent operation. These stay package-visible
    // fields so that any existing code in this package that reads them keeps compiling.
    PDDocument pdoc = null;
    PDFTextStripper pdfText = null;
    StringWriter swriter = null;
    PDDocumentInformation pinf = null;

    private String title = null;
    private String author = null;
    private int numberPages = 0;
    private Calendar calcreation = null;
    private Calendar calmodification = null;
    private String allContent;

    /**
     * Default constructor
     */
    public PdfParser() {
    }

    /**
     * Loads a PDF file and extracts its metadata (title, author, page count,
     * creation and modification dates) and its full text. The results are
     * available afterwards through the getters of this class.
     * <p>
     * Values left over from a previous call are cleared first, so the getters
     * never report data belonging to another file when this call fails.
     *
     * @param f the PDF file to analyze
     * @return {@code true} if the text was extracted; {@code false} if the
     * document is encrypted, does not allow content extraction, has no pages,
     * or an error occurred (the error is logged, not rethrown)
     */
    public boolean analyzePdfDocument(File f) {
        resetExtractedData();
        // Forget the previous document so a failed load does not re-close it below.
        pdoc = null;
        try {
            pdoc = PDDocument.load(f);
            if (!isExtractable()) {
                OutputMonitor.printLine("Encrypted document.", OutputMonitor.INFORMATION_MESSAGE);
                return false;
            }
            setNumberpages(pdoc.getNumberOfPages());

            pinf = pdoc.getDocumentInformation();
            if (pinf == null) {
                OutputMonitor.printLine("The document does not have available information.", OutputMonitor.INFORMATION_MESSAGE);
            } else {
                setTitle(pinf.getTitle());
                setAuthor(pinf.getAuthor());
                setCalCreation(pinf.getCreationDate());
                setCalModification(pinf.getModificationDate());
            }

            // The text is extracted whether or not metadata is present, so a
            // successful return always comes with content.
            allContent = extractText(1, Integer.MAX_VALUE);
            return true;
        } catch (Exception ex) {
            OutputMonitor.printStream("", ex);
        } finally {
            closeDocument("IO");
        }
        return false;
    }

    /**
     * Splits the text of a PDF into fragments of {@value #PAGES_PER_CHUNK}
     * pages and indexes each fragment as a separate Minion document. A PDF with
     * at most {@value #PAGES_PER_CHUNK} pages is indexed as a single document
     * whose key is the file path; larger PDFs use the file path followed by the
     * 1-based fragment number as key.
     *
     * @param f the PDF file to index
     * @param indexer the Minion indexer that receives the documents
     * @throws IOException if the file cannot be loaded or its text cannot be
     * extracted
     */
    public void divideTextforMinion(File f, SimpleIndexer indexer) throws IOException {
        pdoc = PDDocument.load(f);
        try {
            this.numberPages = pdoc.getNumberOfPages();
            if (!isExtractable()) {
                OutputMonitor.printLine("Encrypted book.", OutputMonitor.INFORMATION_MESSAGE);
                return;
            }
            if (this.numberPages > PAGES_PER_CHUNK) {
                int count = 0; // fragment number, used in the document key
                for (int first = 1; first <= numberPages; first += PAGES_PER_CHUNK) {
                    count++;
                    int last = Math.min(first + PAGES_PER_CHUNK - 1, numberPages);
                    addMinionDocument(indexer, f, f.getPath() + count, extractText(first, last));
                }
            } else {
                // Small document: take the whole text as it is.
                addMinionDocument(indexer, f, f.getPath(), extractText(1, Integer.MAX_VALUE));
            }
        } finally {
            // Close exactly once, also when extraction or indexing fails.
            closeDocument("");
        }
    }

    /**
     * Splits the text of a PDF into fragments of {@value #PAGES_PER_CHUNK}
     * pages and adds every fragment, together with the file path and name, to
     * the given Lucene documents.
     *
     * @param f the PDF file to index
     * @param doccs document receiving the "filepathcs", "namecs" and "bookcs" fields
     * @param doc document receiving the "filepath", "name" and "book" fields
     * @param doclsi optional document receiving only the "book" field; may be
     * {@code null}
     * @throws IOException if the file cannot be loaded or its text cannot be
     * extracted
     */
    public void divideTextforLucene(File f, DocumentLucene doccs, DocumentLucene doc, DocumentLucene doclsi) throws IOException {
        pdoc = PDDocument.load(f);
        try {
            this.numberPages = pdoc.getNumberOfPages();
            if (!isExtractable()) {
                OutputMonitor.printLine("Encrypted book.", OutputMonitor.INFORMATION_MESSAGE);
                return;
            }
            if (this.numberPages > PAGES_PER_CHUNK) {
                for (int first = 1; first <= numberPages; first += PAGES_PER_CHUNK) {
                    int last = Math.min(first + PAGES_PER_CHUNK - 1, numberPages);
                    addLuceneFields(f, doccs, doc, doclsi, extractText(first, last));
                }
            } else {
                // Small document: take the whole text as it is.
                addLuceneFields(f, doccs, doc, doclsi, extractText(1, Integer.MAX_VALUE));
            }
        } finally {
            // Close exactly once, also when extraction or indexing fails.
            closeDocument("");
        }
    }

    /**
     * Tells whether text can be read from the currently loaded document: it
     * must not be encrypted, must grant the extract-content permission and
     * must have at least one page.
     */
    private boolean isExtractable() {
        return !pdoc.isEncrypted()
                && pdoc.getCurrentAccessPermission().canExtractContent()
                && pdoc.getNumberOfPages() != 0;
    }

    /**
     * Extracts the text of a page range from the currently loaded document.
     * PDFTextStripper page numbers are 1-based and the end page is inclusive.
     *
     * @param startPage first page to extract (1-based)
     * @param endPage last page to extract (inclusive); {@code Integer.MAX_VALUE}
     * means "up to the end of the document"
     */
    private String extractText(int startPage, int endPage) throws IOException {
        swriter = new StringWriter();
        pdfText = new PDFTextStripper();
        pdfText.setStartPage(startPage);
        pdfText.setEndPage(endPage);
        pdfText.writeText(pdoc, swriter);
        return swriter.toString();
    }

    /** Indexes one text fragment of the file as a Minion document stored under the given key. */
    private static void addMinionDocument(SimpleIndexer indexer, File f, String key, String fragment) throws IOException {
        DocumentMinion docm = new DocumentMinion(indexer, key);
        docm.addField("filepath", f.getAbsolutePath());
        docm.addField("name", f.getName());
        docm.addField("book", fragment);
        docm.closeDocument();
    }

    /** Adds the path, name and one text fragment of the file to the Lucene documents. */
    private static void addLuceneFields(File f, DocumentLucene doccs, DocumentLucene doc, DocumentLucene doclsi, String fragment) throws IOException {
        String canonicalPath = f.getCanonicalPath();
        doc.addField("filepath", canonicalPath);
        doccs.addField("filepathcs", canonicalPath);
        doc.addField("name", f.getName());
        doccs.addField("namecs", f.getName());
        doc.addField("book", fragment);
        doccs.addField("bookcs", fragment);
        if (doclsi != null) {
            doclsi.addField("book", fragment);
        }
    }

    /** Closes the currently loaded document, logging (not propagating) a failure to close. */
    private void closeDocument(String logLabel) {
        if (pdoc == null) {
            return;
        }
        try {
            pdoc.close();
        } catch (IOException ex) {
            OutputMonitor.printStream(logLabel, ex);
        }
    }

    /** Clears everything extracted by a previous analysis. */
    private void resetExtractedData() {
        title = null;
        author = null;
        numberPages = 0;
        calcreation = null;
        calmodification = null;
        allContent = null;
    }

    // ------------------------------------------------------------------
    // Getters and setters
    // ------------------------------------------------------------------

    /**
     * @return the title
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the title to set
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the author
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @param author the author to set
     */
    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * @return the number of pages
     */
    public int getNumberpages() {
        return numberPages;
    }

    /**
     * @param numberpages the number of pages to set
     */
    public void setNumberpages(int numberpages) {
        this.numberPages = numberpages;
    }

    /**
     * @return the creation date
     */
    public Calendar getCalCreation() {
        return calcreation;
    }

    /**
     * @param cal the creation date to set
     */
    public void setCalCreation(Calendar cal) {
        this.calcreation = cal;
    }

    /**
     * @return the modification date
     */
    public Calendar getCalModification() {
        return calmodification;
    }

    /**
     * @param calmod the modification date to set
     */
    public void setCalModification(Calendar calmod) {
        this.calmodification = calmod;
    }

    /**
     * @return the extracted text content
     */
    public String getAllContent() {
        return allContent;
    }

    /**
     * @param allContent the text content to set
     */
    public void setAllContent(String allContent) {
        this.allContent = allContent;
    }
}