/* ================================================================== * Created [2009-4-27 下午11:32:55] by Jon.King * ================================================================== * TSS * ================================================================== * mailTo:jinpujun@hotmail.com * Copyright (c) Jon.King, 2009-2012 * ================================================================== */ package com.jinhe.tss.cms.lucene; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.math.BigDecimal; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.util.PDFTextStripper; import com.jinhe.tss.core.util.FileHelper; /** * 对文章附件内容进行索引。 * * 用于读取文章的附件内容进行全文检索。 * 附件可能为压缩文件、txt、pdf、word、excel、ppt等多种类型。 */ public class AttachmentIndex { private static Logger log = Logger.getLogger(AttachmentIndex.class); private AttachmentIndex(){ } private static AttachmentIndex manager; public static AttachmentIndex getInstance(){ if(manager == null) { manager = new AttachmentIndex(); } return manager; } /** * 附件处理得到到附件文本信息(zip文件,解压后取文件列表文本信息) * @param file * @return */ public String disposeAttachment(File file) { if ( !file.exists() || !file.isFile() ) return null; StringBuffer content = new StringBuffer(); // 检测attachment类型如果为zip文件先解压文件,如果普通文件直接解析。 if ( "zip".equals(FileHelper.getFileSuffix(file.getName())) ) { // 解压压缩包,得到解压后的文件路径 String path = FileHelper.unZip(file, file.getParent()); // 判断解压后的文件列表(如果是 文件,再判断是不是压缩文件。 // 如果是压缩文件就递归。如果是文件夹得到文夹下的文件列表。如果是普通文件直接得到文件信息) List<String> fileList = getFileList(new File(path)); for ( String fileName : fileList ) { File tempFile = new File(fileName); if ( tempFile.exists() && tempFile.isDirectory() ) { content.append(getFileContent(tempFile)); } else if (tempFile.exists() && tempFile.isFile()) { if ("zip".equals(FileHelper.getFileSuffix(tempFile.getName()))) { content.append(disposeAttachment(tempFile)); // 解压缩递归调用 } else { content.append(getContentFromFile(tempFile)); } } } } else { content.append(getContentFromFile(file)); } return content.toString(); } /** * 得到文件内容(递归调用) */ private String getFileContent(File path) { if ( path.exists() && path.isFile() ) return getContentFromFile(path); StringBuffer content = new StringBuffer(); if (path.exists() && path.isDirectory()) { List<String> fileList = getFileList(path); for ( String fileName : fileList ) { content.append(getFileContent(new File(fileName))); } } return content.toString(); } /** * 得到文件列表 */ private List<String> getFileList(File file) { List<String> returnList = new ArrayList<String>(); List<String> fileList = FileHelper.listFiles(file); for ( String fileName : fileList ) { File tempFile = new File(file.getPath() + "/" + fileName.trim()); if ( tempFile.isFile() ) { returnList.add(tempFile.getPath()); } else if (tempFile.isDirectory()) { returnList.addAll(getFileList(tempFile)); } } return returnList; } /** * 得到提取附件的文本内容 */ private String getContentFromFile(File file) { if (!file.exists() || !file.isFile()) return ""; String suffix = FileHelper.getFileSuffix(file.getName()); if ( "doc".equals(suffix) || "docx".equals(suffix) ) { // word文档 return getContentFromWord(file); } else if ( "ppt".equals(suffix) || "pptx".equals(suffix) ) { // ppt文档 return getContentFromPPT(file); } else if ( "xls".equals(suffix) || "xlsx".equals(suffix) ) { // excle文档 return getContentFromExcel(file); } else if ( "pdf".equals(suffix)) { // pdf文档 return getContentFromPDF(file); } else { return getContentFromText(file); // 普通文档,txt等 } } private String getContentFromText(File textFile) { BufferedReader br = null; StringBuffer sb = new StringBuffer(); try { br = new BufferedReader(new FileReader(textFile)); String temp = ""; while (null != temp) { sb.append(br.readLine()).append("\n"); temp = br.readLine(); } } catch (Exception e) { log.error("发布索引时提取文档:" + textFile.getPath() + " 内容失败!", e); return ""; } finally { try { br.close(); } catch (IOException e) { log.error("发布索引时关闭文件:" + textFile.getPath() + " 流失败!", e); } } return sb.toString(); } private String getContentFromPDF(File pdfFile) { PDDocument document = null; // 内存中存储的PDF Document String content = ""; try { // 加载pdf文档 document = PDDocument.load(pdfFile); // PDFTextStripper来提取文本 PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(false); // 设置是否排序 stripper.setStartPage( 1 ); // 设置起始页 stripper.setEndPage( Integer.MAX_VALUE ); // 设置结束页 // 调用PDFTextStripper的getText()提取文本信息 content = stripper.getText(document); } catch (Exception e) { log.error("发布索引时提取PDF文档:" + pdfFile.getPath() + " 内容失败!", e); return ""; } finally { if (document != null) { // 关闭PDF Document try { document.close(); } catch (IOException e) { e.printStackTrace(); } } } return content; } private String getContentFromPPT(File pptFile) { FileInputStream pptInput = null; PowerPointExtractor pptExtractor; try { pptInput = new FileInputStream(pptFile); pptExtractor = new PowerPointExtractor(pptInput); } catch(Exception e) { log.error("发布索引时提取PPT文档:" + pptFile.getPath() + " 内容失败!", e); return ""; } finally { try { pptInput.close(); } catch (IOException e) { log.error("发布索引时关闭文件:" + pptFile.getPath() + " 流失败!", e); } } return pptExtractor.getText(); } private String getContentFromWord(File wordFile) { FileInputStream wordInput = null; try { wordInput = new FileInputStream(wordFile); //word 2003: 图片不会被读取 WordExtractor wordExtractor = new WordExtractor(wordInput); return wordExtractor.getText(); } catch (Exception e) { try { // word 2007 图片不会被读取, 表格中的数据会被放在字符串的最后 OPCPackage opcPackage = POIXMLDocument.openPackage(wordFile.getPath()); POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); return extractor.getText(); } catch (Exception ex) { log.error("发布索引时提取文档:" + wordFile.getPath() + " 内容失败!", e); return ""; } } finally { try { wordInput.close(); } catch (IOException e) { log.error("发布索引时关闭文件:" + wordFile.getPath() + " 流失败!", e); } } } private String getContentFromExcel(File excelFile) { FileInputStream excelInput = null; try { String returnstr = ""; excelInput = new FileInputStream(excelFile); HSSFWorkbook workBook = new HSSFWorkbook(excelInput); for (int numSheets = 0; numSheets < workBook.getNumberOfSheets(); numSheets++) { if (null == workBook.getSheetAt(numSheets)) continue; HSSFSheet sheet = workBook.getSheetAt(numSheets); // 单页信息 for (int rowNumOfSheet = 0; rowNumOfSheet <= sheet.getLastRowNum(); rowNumOfSheet++) { if (null == sheet.getRow(rowNumOfSheet)) continue; HSSFRow row = sheet.getRow(rowNumOfSheet); // 得到行信息 for (int cellNumOfRow = 0; cellNumOfRow <= row.getLastCellNum(); cellNumOfRow++) { if ( row.getCell(cellNumOfRow) == null) continue; HSSFCell cell = row.getCell(cellNumOfRow); // 得到单元格信息 int cellType = cell.getCellType(); // 单元格信息类型 String strCell = null; switch (cellType) { case HSSFCell.CELL_TYPE_NUMERIC: // 数字 BigDecimal bd = new BigDecimal(cell.getNumericCellValue()); strCell = bd.toString(); returnstr += strCell; break; case HSSFCell.CELL_TYPE_STRING: // 字符串 strCell = cell.toString(); returnstr += strCell; break; case HSSFCell.CELL_TYPE_FORMULA: // formula strCell = String.valueOf(cell.getNumericCellValue()); returnstr += strCell; break; case HSSFCell.CELL_TYPE_BLANK: // 无内容 strCell = cell.toString(); returnstr += strCell; break; default: System.out.println("空的EXCLE表格。"); } } } } return returnstr; } catch (Exception e) { log.error("发布索引时提取文档:" + excelFile.getPath() + " 内容失败!", e); return ""; } finally { try { if(excelInput != null) { excelInput.close(); } } catch (IOException e) { log.error("发布索引时关闭文件:" + excelFile.getPath() + " 流失败!", e); } } } }