/* ==================================================================
* Created [2009-4-27 下午11:32:55] by Jon.King
* ==================================================================
* TSS
* ==================================================================
* mailTo:jinpujun@hotmail.com
* Copyright (c) Jon.King, 2009-2012
* ==================================================================
*/
package com.jinhe.tss.cms.lucene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import com.jinhe.tss.core.util.FileHelper;
/**
 * Extracts the textual content of article attachments so that it can be
 * fed to the full-text index.
 *
 * <p>Attachments are dispatched on their file-name suffix: zip archives are
 * unpacked and every contained file is processed in turn; Word, PowerPoint,
 * Excel and PDF documents go through their dedicated extractors; everything
 * else is read as plain text.
 *
 * <p>All extractors are best-effort: a failure is logged and yields an empty
 * string instead of propagating to the caller.
 */
public class AttachmentIndex {

    private static final Logger log = Logger.getLogger(AttachmentIndex.class);

    /** Lazily created shared instance; see {@link #getInstance()}. */
    private static AttachmentIndex manager;

    private AttachmentIndex() {
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * <p>The lazy initialisation is not synchronized. The class has no
     * instance fields, so a duplicate instance created by a race carries no
     * state of its own.
     */
    public static AttachmentIndex getInstance() {
        if (manager == null) {
            manager = new AttachmentIndex();
        }
        return manager;
    }

    /**
     * Extracts the text of an attachment. A zip archive is unpacked next to
     * itself and the text of every extracted file is concatenated; nested
     * archives are handled recursively.
     *
     * @param file the attachment to process
     * @return the extracted text, or {@code null} when {@code file} is
     *         {@code null}, does not exist or is not a regular file
     */
    public String disposeAttachment(File file) {
        if (file == null || !file.exists() || !file.isFile()) {
            return null;
        }

        StringBuilder content = new StringBuilder();
        if (!isZip(file)) {
            content.append(getContentFromFile(file));
            return content.toString();
        }

        // Unpack the archive; unZip returns the path that is listed below.
        String path = FileHelper.unZip(file, file.getParent());
        List<String> fileList = getFileList(new File(path));
        for (String fileName : fileList) {
            File tempFile = new File(fileName);
            if (!tempFile.exists()) {
                continue;
            }
            if (tempFile.isDirectory()) {
                content.append(getFileContent(tempFile));
            } else if (tempFile.isFile()) {
                if (isZip(tempFile)) {
                    // Nested archive: recurse. Guard against null so the
                    // literal "null" never ends up in the indexed text.
                    String nested = disposeAttachment(tempFile);
                    if (nested != null) {
                        content.append(nested);
                    }
                } else {
                    content.append(getContentFromFile(tempFile));
                }
            }
        }
        return content.toString();
    }

    /** Tells whether the file name carries a zip suffix (case-insensitive). */
    private static boolean isZip(File file) {
        return "zip".equalsIgnoreCase(FileHelper.getFileSuffix(file.getName()));
    }

    /**
     * Returns the text of a file, or the concatenated text of every file
     * below a directory. Returns an empty string when the path is neither.
     */
    private String getFileContent(File path) {
        if (path.exists() && path.isFile()) {
            return getContentFromFile(path);
        }

        StringBuilder content = new StringBuilder();
        if (path.exists() && path.isDirectory()) {
            for (String fileName : getFileList(path)) {
                content.append(getFileContent(new File(fileName)));
            }
        }
        return content.toString();
    }

    /**
     * Collects the paths of all regular files below {@code file},
     * descending into sub-directories.
     */
    private List<String> getFileList(File file) {
        List<String> returnList = new ArrayList<String>();
        List<String> fileList = FileHelper.listFiles(file);
        for (String fileName : fileList) {
            File tempFile = new File(file, fileName.trim());
            if (tempFile.isFile()) {
                returnList.add(tempFile.getPath());
            } else if (tempFile.isDirectory()) {
                returnList.addAll(getFileList(tempFile));
            }
        }
        return returnList;
    }

    /**
     * Picks the extractor matching the file-name suffix (compared
     * case-insensitively) and returns the extracted text. Unknown suffixes
     * are read as plain text.
     *
     * NOTE(review): "pptx" and "xlsx" are routed to PowerPointExtractor and
     * HSSFWorkbook; if those only accept the binary formats, such files end
     * up logged and indexed as "" - confirm whether OOXML support is needed.
     */
    private String getContentFromFile(File file) {
        if (!file.exists() || !file.isFile()) {
            return "";
        }

        String suffix = FileHelper.getFileSuffix(file.getName());
        if ("doc".equalsIgnoreCase(suffix) || "docx".equalsIgnoreCase(suffix)) {
            return getContentFromWord(file);
        }
        if ("ppt".equalsIgnoreCase(suffix) || "pptx".equalsIgnoreCase(suffix)) {
            return getContentFromPPT(file);
        }
        if ("xls".equalsIgnoreCase(suffix) || "xlsx".equalsIgnoreCase(suffix)) {
            return getContentFromExcel(file);
        }
        if ("pdf".equalsIgnoreCase(suffix)) {
            return getContentFromPDF(file);
        }
        return getContentFromText(file); // plain documents such as txt
    }

    /**
     * Reads a plain-text file line by line, terminating every line with
     * {@code "\n"}. Returns "" when the file cannot be read.
     */
    private String getContentFromText(File textFile) {
        BufferedReader br = null;
        StringBuilder sb = new StringBuilder();
        try {
            br = new BufferedReader(new FileReader(textFile));
            // Read each line exactly once; readLine() returns null at EOF.
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
        } catch (Exception e) {
            log.error("发布索引时提取文档:" + textFile.getPath() + " 内容失败!", e);
            return "";
        } finally {
            closeStream(br, textFile);
        }
        return sb.toString();
    }

    /** Extracts the text of all pages of a PDF document; "" on failure. */
    private String getContentFromPDF(File pdfFile) {
        PDDocument document = null;
        try {
            document = PDDocument.load(pdfFile);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(false);
            stripper.setStartPage(1);
            stripper.setEndPage(Integer.MAX_VALUE); // i.e. up to the last page
            return stripper.getText(document);
        } catch (Exception e) {
            log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败!", e);
            return "";
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    log.error("发布索引时关闭文件:" + pdfFile.getPath() + " 流失败!", e);
                }
            }
        }
    }

    /** Extracts the text of a PowerPoint presentation; "" on failure. */
    private String getContentFromPPT(File pptFile) {
        FileInputStream pptInput = null;
        try {
            pptInput = new FileInputStream(pptFile);
            PowerPointExtractor pptExtractor = new PowerPointExtractor(pptInput);
            return pptExtractor.getText();
        } catch (Exception e) {
            log.error("发布索引时提取PPT文档:" + pptFile.getPath() + " 内容失败!", e);
            return "";
        } finally {
            closeStream(pptInput, pptFile);
        }
    }

    /**
     * Extracts the text of a Word document. The binary extractor is tried
     * first; when it fails the file is re-opened as an OOXML package.
     * Returns "" when both attempts fail.
     */
    private String getContentFromWord(File wordFile) {
        FileInputStream wordInput = null;
        try {
            wordInput = new FileInputStream(wordFile);
            // Word 2003: pictures are not read.
            WordExtractor wordExtractor = new WordExtractor(wordInput);
            return wordExtractor.getText();
        } catch (Exception e) {
            OPCPackage opcPackage = null;
            try {
                // Word 2007: pictures are not read, table data is placed at
                // the end of the returned string.
                opcPackage = POIXMLDocument.openPackage(wordFile.getPath());
                POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
                return extractor.getText();
            } catch (Exception ex) {
                // Report both failures: which one matters depends on the
                // actual format of the file.
                String message = "发布索引时提取文档:" + wordFile.getPath() + " 内容失败!";
                log.error(message, e);
                log.error(message, ex);
                return "";
            } finally {
                if (opcPackage != null) {
                    // Release the package without writing anything back to
                    // the attachment; the text has already been extracted.
                    opcPackage.revert();
                }
            }
        } finally {
            closeStream(wordInput, wordFile);
        }
    }

    /**
     * Extracts the cell values of every sheet of an Excel workbook. Values
     * are separated by a single space so that adjacent cells do not merge
     * into one token. Returns "" on failure.
     */
    private String getContentFromExcel(File excelFile) {
        FileInputStream excelInput = null;
        try {
            StringBuilder text = new StringBuilder();
            excelInput = new FileInputStream(excelFile);
            HSSFWorkbook workBook = new HSSFWorkbook(excelInput);
            for (int sheetIndex = 0; sheetIndex < workBook.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = workBook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is the last cell index PLUS ONE,
                    // hence the strict comparison.
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        HSSFCell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }
                        String cellText = getCellText(cell);
                        if (cellText != null && cellText.length() > 0) {
                            text.append(cellText).append(' ');
                        }
                    }
                }
            }
            return text.toString();
        } catch (Exception e) {
            log.error("发布索引时提取文档:" + excelFile.getPath() + " 内容失败!", e);
            return "";
        } finally {
            closeStream(excelInput, excelFile);
        }
    }

    /** Renders a single cell as text according to its cell type. */
    private String getCellText(HSSFCell cell) {
        switch (cell.getCellType()) {
            case HSSFCell.CELL_TYPE_NUMERIC:
                // valueOf() goes through the shortest decimal form of the
                // double, avoiding the long binary expansion produced by
                // new BigDecimal(double) (0.1 -> "0.1", 123.0 -> "123").
                return BigDecimal.valueOf(cell.getNumericCellValue())
                        .stripTrailingZeros().toPlainString();
            case HSSFCell.CELL_TYPE_STRING:
            case HSSFCell.CELL_TYPE_BLANK:
                return cell.toString();
            case HSSFCell.CELL_TYPE_FORMULA:
                return getFormulaCellText(cell);
            default:
                // Remaining cell types are not indexed.
                log.debug("空的EXCLE表格。");
                return "";
        }
    }

    /**
     * Renders a formula cell. The numeric result is preferred; when the cell
     * does not provide one, its string result and finally its toString()
     * form are used, so one such cell cannot abort the whole workbook.
     */
    private String getFormulaCellText(HSSFCell cell) {
        try {
            return String.valueOf(cell.getNumericCellValue());
        } catch (RuntimeException notNumeric) {
            try {
                return cell.getRichStringCellValue().getString();
            } catch (RuntimeException notText) {
                return cell.toString();
            }
        }
    }

    /**
     * Closes a stream opened on {@code source}, logging instead of throwing.
     * A {@code null} stream (the open itself failed) is ignored.
     */
    private void closeStream(java.io.Closeable stream, File source) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            log.error("发布索引时关闭文件:" + source.getPath() + " 流失败!", e);
        }
    }
}