AttachmentIndex.java example

Explorer
jinhe-tss-master
/* ==================================================================   
 * Created [2009-4-27 下午11:32:55] by Jon.King 
 * ==================================================================  
 * TSS 
 * ================================================================== 
 * mailTo:jinpujun@hotmail.com
 * Copyright (c) Jon.King, 2009-2012 
 * ================================================================== 
 */

package com.jinhe.tss.cms.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

import com.jinhe.tss.core.util.FileHelper;

/**
 * Extracts the text content of article attachments so that it can be indexed
 * for full-text search.
 *
 * <p>An attachment may be a zip archive, a plain-text file, or a PDF, Word,
 * Excel or PowerPoint document. Every extractor in this class is best-effort:
 * a failure is logged and yields an empty string instead of an exception.
 */
public class AttachmentIndex {

    private static final Logger log = Logger.getLogger(AttachmentIndex.class);

    /** The single shared instance; created eagerly because the class holds no state. */
    private static final AttachmentIndex manager = new AttachmentIndex();

    private AttachmentIndex() {
    }

    /**
     * Returns the shared instance of this class.
     *
     * @return the singleton {@code AttachmentIndex}
     */
    public static AttachmentIndex getInstance() {
        return manager;
    }

    /**
     * Extracts the text of an attachment. A zip archive is unpacked next to
     * itself and the text of every unpacked file (nested archives included)
     * is concatenated; any other file is parsed directly.
     *
     * @param file the attachment to read
     * @return the extracted text, or {@code null} when {@code file} is
     *         {@code null} or is not an existing regular file
     */
    public String disposeAttachment(File file) {
        // isFile() is false for missing paths as well, so one check covers both cases.
        if (file == null || !file.isFile()) {
            return null;
        }

        if (!isZip(file)) {
            return getContentFromFile(file);
        }

        StringBuilder content = new StringBuilder();

        // Unpack the archive; the helper returns the path holding the unpacked files.
        String extractedRoot = FileHelper.unZip(file, file.getParent());
        if (extractedRoot == null) {
            // NOTE(review): FileHelper.unZip is not visible here; treat a null result as "nothing extracted".
            return content.toString();
        }

        for (String entryPath : getFileList(new File(extractedRoot))) {
            File entry = new File(entryPath);

            // NOTE(review): whether the listing can contain the archive itself depends on
            // FileHelper.unZip (not visible here); skipping it avoids unbounded recursion if it does.
            if (entry.getAbsoluteFile().equals(file.getAbsoluteFile())) {
                continue;
            }

            if (entry.isDirectory()) {
                // getFileList() already flattens directories, so this branch is only a safety net.
                content.append(getFileContent(entry));
            } else if (entry.isFile()) {
                if (isZip(entry)) {
                    // Nested archive: recurse, but never append the literal text "null".
                    String nested = disposeAttachment(entry);
                    if (nested != null) {
                        content.append(nested);
                    }
                } else {
                    content.append(getContentFromFile(entry));
                }
            }
        }
        return content.toString();
    }

    /**
     * Tells whether the file name carries a zip suffix (compared case-insensitively).
     */
    private static boolean isZip(File file) {
        return "zip".equalsIgnoreCase(FileHelper.getFileSuffix(file.getName()));
    }

    /**
     * Returns the text of a file, or the concatenated text of every file
     * below a directory (recursive).
     *
     * @param path a file or directory
     * @return the extracted text; empty when {@code path} is neither
     */
    private String getFileContent(File path) {
        if (path.isFile()) {
            return getContentFromFile(path);
        }

        StringBuilder content = new StringBuilder();
        if (path.isDirectory()) {
            for (String childPath : getFileList(path)) {
                content.append(getFileContent(new File(childPath)));
            }
        }
        return content.toString();
    }

    /**
     * Collects the paths of all regular files below a directory, descending
     * into sub-directories.
     *
     * @param file the directory to list
     * @return the paths of the regular files found; never {@code null}
     */
    private List<String> getFileList(File file) {
        List<String> filePaths = new ArrayList<String>();

        List<String> childNames = FileHelper.listFiles(file);
        if (childNames == null) {
            // NOTE(review): FileHelper.listFiles is not visible here; guard in case it can return null.
            return filePaths;
        }

        for (String childName : childNames) {
            File child = new File(file, childName.trim());
            if (child.isFile()) {
                filePaths.add(child.getPath());
            } else if (child.isDirectory()) {
                filePaths.addAll(getFileList(child));
            }
        }
        return filePaths;
    }

    /**
     * Picks an extractor from the file suffix (compared case-insensitively)
     * and returns the text it produces. Unknown suffixes are read as plain text.
     *
     * <p>NOTE(review): "pptx" and "xlsx" are routed to the same extractors as
     * "ppt" and "xls"; those extractors presumably reject the OOXML formats, in
     * which case the failure is logged and "" is returned - confirm.
     *
     * @param file the file to read
     * @return the extracted text, or "" when the file is missing or unreadable
     */
    private String getContentFromFile(File file) {
        if (!file.isFile()) {
            return "";
        }

        String suffix = FileHelper.getFileSuffix(file.getName());
        if ("doc".equalsIgnoreCase(suffix) || "docx".equalsIgnoreCase(suffix)) { // Word
            return getContentFromWord(file);
        }
        if ("ppt".equalsIgnoreCase(suffix) || "pptx".equalsIgnoreCase(suffix)) { // PowerPoint
            return getContentFromPPT(file);
        }
        if ("xls".equalsIgnoreCase(suffix) || "xlsx".equalsIgnoreCase(suffix)) { // Excel
            return getContentFromExcel(file);
        }
        if ("pdf".equalsIgnoreCase(suffix)) { // PDF
            return getContentFromPDF(file);
        }
        return getContentFromText(file); // anything else: plain text (txt etc.)
    }

    /**
     * Reads a file as text, line by line, terminating every line with "\n".
     * The file is decoded by {@link FileReader}, i.e. with the platform
     * default charset.
     *
     * @param textFile the file to read
     * @return the file content, or "" when reading fails
     */
    private String getContentFromText(File textFile) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(textFile));
            StringBuilder text = new StringBuilder();
            // Exactly one readLine() per iteration: no line is skipped and the
            // end-of-stream null is never appended.
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\n");
            }
            return text.toString();
        } catch (IOException e) {
            log.error("发布索引时提取文档:" + textFile.getPath() + " 内容失败！", e);
            return "";
        } finally {
            closeQuietly(reader, textFile);
        }
    }

    /**
     * Extracts the text of all pages of a PDF document.
     *
     * @param pdfFile the PDF file to read
     * @return the extracted text, or "" when extraction fails
     */
    private String getContentFromPDF(File pdfFile) {
        PDDocument document = null;
        try {
            document = PDDocument.load(pdfFile);

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(false);      // do not sort the text by position
            stripper.setStartPage(1);               // from the first page ...
            stripper.setEndPage(Integer.MAX_VALUE); // ... through the last one

            return stripper.getText(document);
        } catch (Exception e) {
            log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败！", e);
            return "";
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    log.error("发布索引时关闭文件:" + pdfFile.getPath() + " 流失败！", e);
                }
            }
        }
    }

    /**
     * Extracts the text of a PowerPoint presentation.
     *
     * @param pptFile the presentation to read
     * @return the extracted text, or "" when extraction fails
     */
    private String getContentFromPPT(File pptFile) {
        FileInputStream input = null;
        try {
            input = new FileInputStream(pptFile);
            // getText() stays inside the try so its failures are logged as well.
            return new PowerPointExtractor(input).getText();
        } catch (Exception e) {
            log.error("发布索引时提取PPT文档:" + pptFile.getPath() + " 内容失败！", e);
            return "";
        } finally {
            closeQuietly(input, pptFile);
        }
    }

    /**
     * Extracts the text of a Word document. The Word 2003 (binary) extractor
     * is tried first; if it fails the file is re-read as Word 2007 (OOXML).
     * Images are not read by either extractor.
     *
     * @param wordFile the document to read
     * @return the extracted text, or "" when both attempts fail
     */
    private String getContentFromWord(File wordFile) {
        FileInputStream input = null;
        try {
            input = new FileInputStream(wordFile);
            return new WordExtractor(input).getText();
        } catch (Exception binaryFailure) {
            return getContentFromWord2007(wordFile, binaryFailure);
        } finally {
            closeQuietly(input, wordFile);
        }
    }

    /**
     * Word 2007 fallback for {@link #getContentFromWord(File)}. Table data is
     * placed at the end of the returned string by the extractor.
     *
     * @param wordFile      the document to read
     * @param binaryFailure why the Word 2003 attempt failed; logged if this attempt fails too
     * @return the extracted text, or "" when extraction fails
     */
    private String getContentFromWord2007(File wordFile, Exception binaryFailure) {
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(wordFile.getPath());
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } catch (Exception ooxmlFailure) {
            // Either exception may be the real cause depending on the actual format, so log both.
            log.error("发布索引时提取文档:" + wordFile.getPath() + " 内容失败！", binaryFailure);
            log.error("发布索引时提取文档:" + wordFile.getPath() + " 内容失败！", ooxmlFailure);
            return "";
        } finally {
            if (opcPackage != null) {
                try {
                    // revert() releases the package without writing anything back to the file.
                    opcPackage.revert();
                } catch (RuntimeException e) {
                    log.error("发布索引时关闭文件:" + wordFile.getPath() + " 流失败！", e);
                }
            }
        }
    }

    /**
     * Extracts the text of every cell of every sheet of an Excel workbook.
     * Cell values are separated by a single space so that neighbouring cells
     * are not fused into one token.
     *
     * @param excelFile the workbook to read
     * @return the extracted text, or "" when extraction fails
     */
    private String getContentFromExcel(File excelFile) {
        FileInputStream input = null;
        try {
            input = new FileInputStream(excelFile);
            HSSFWorkbook workbook = new HSSFWorkbook(input);

            StringBuilder text = new StringBuilder();
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }

                // getLastRowNum() is the index of the last row, hence "<=".
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }

                    // getLastCellNum() is the last cell index PLUS ONE, hence "<".
                    for (int cellIndex = 0; cellIndex < row.getLastCellNum(); cellIndex++) {
                        HSSFCell cell = row.getCell(cellIndex);
                        if (cell == null) {
                            continue;
                        }

                        String cellText = getCellText(cell);
                        if (cellText != null && cellText.length() > 0) {
                            text.append(cellText).append(' ');
                        }
                    }
                }
            }
            return text.toString();
        } catch (Exception e) {
            log.error("发布索引时提取文档:" + excelFile.getPath() + " 内容失败！", e);
            return "";
        } finally {
            closeQuietly(input, excelFile);
        }
    }

    /**
     * Converts one cell to text according to its cell type. Cell types other
     * than numeric, string, formula and blank contribute no text.
     */
    private static String getCellText(HSSFCell cell) {
        switch (cell.getCellType()) {
        case HSSFCell.CELL_TYPE_NUMERIC:
            return toPlainNumber(cell.getNumericCellValue());
        case HSSFCell.CELL_TYPE_FORMULA:
            try {
                return String.valueOf(cell.getNumericCellValue());
            } catch (RuntimeException notNumeric) {
                // The formula result is not a number; fall back to the cell's own
                // string form instead of failing the whole workbook.
                return cell.toString();
            }
        case HSSFCell.CELL_TYPE_STRING:
        case HSSFCell.CELL_TYPE_BLANK:
            return cell.toString();
        default:
            return "";
        }
    }

    /**
     * Formats a numeric cell value without scientific notation, without
     * trailing zeros and without binary floating-point noise
     * (100.0 gives "100", 0.1 gives "0.1").
     */
    private static String toPlainNumber(double value) {
        BigDecimal decimal = BigDecimal.valueOf(value);
        if (decimal.signum() == 0) {
            return "0";
        }
        return decimal.stripTrailingZeros().toPlainString();
    }

    /**
     * Closes a stream if it was opened; a failure to close is logged, never thrown.
     *
     * @param stream the stream to close, may be {@code null}
     * @param source the file the stream was reading, used in the log message
     */
    private static void closeQuietly(java.io.Closeable stream, File source) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            log.error("发布索引时关闭文件:" + source.getPath() + " 流失败！", e);
        }
    }
}