/*
* Copyright (C) 2000 - 2012 TagServlet Ltd
*
* This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
*
* OpenBD is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3.
*
* OpenBD is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenBD. If not, see http://www.gnu.org/licenses/
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
* it with any of the JARS listed in the README.txt (or a modified version of
* that library), containing parts covered by the terms of that JAR, the
* licensors of this Program grant you additional permission to convey the
* resulting work.
* README.txt @ http://www.openbluedragon.org/license/README.txt
*
* http://www.openbluedragon.org/
*
* $Id: FileHandlerMSOfficeImpl.java 2044 2012-05-01 12:38:43Z alan $
*/
package com.bluedragon.search.index.crawl.handler;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import com.bluedragon.search.DocumentWrap;
/**
 * Crawl handler for Microsoft Office documents. The document body and a few
 * metadata fields are extracted with Apache Tika and copied into a
 * {@link DocumentWrap}.
 *
 * <p>Files whose name ends in "x" (docx, xlsx, pptx) are parsed with
 * {@link OOXMLParser}; every other file is parsed with {@link OfficeParser}.
 */
public class FileHandlerMSOfficeImpl extends AbstractFileHandler {

  /** File extensions (lower case, without the dot) returned by {@link #getExtensions()}. */
  private static final Set<String> extensions = new HashSet<String>( Arrays.asList("doc","docx","xls","xlsx","ppt","pptx") );

  /**
   * MIME types returned by {@link #getMimeTypes()}.
   * NOTE(review): only the legacy Office types are listed; the OOXML
   * (application/vnd.openxmlformats-officedocument.*) types are absent -- confirm
   * whether callers select handlers by MIME type before extending this set.
   */
  private static final Set<String> mimetypes = new HashSet<String>( Arrays.asList("application/msword","application/msexcel","application/mspowerpoint") );

  /**
   * @param bStoreBody forwarded to the superclass; also passed to
   *                   {@link DocumentWrap#setContent} when a file is crawled
   */
  public FileHandlerMSOfficeImpl(boolean bStoreBody) {
    super(bStoreBody);
  }

  /**
   * Parses the given Office file and builds the document to be indexed.
   *
   * @param uriroot root used to build the document URL; when {@code null} no URL is set
   * @param file    the Office file to parse
   * @return a document carrying author, summary (the file's comments field),
   *         body text, size, id (canonical path) and, optionally, URL
   * @throws CrawlException if the file cannot be found, read or parsed; the
   *                        underlying exception is preserved as the cause
   */
  @Override
  public DocumentWrap crawl( String uriroot, File file ) throws CrawlException {
    DocumentWrap document = new DocumentWrap();

    try {
      openFile( file );

      // A write limit of -1 disables BodyContentHandler's default cap of
      // 100,000 characters. With the default, larger documents abort parsing
      // with a SAXException and would never be indexed. The extracted text
      // is held in memory in full.
      ContentHandler textHandler = new BodyContentHandler( -1 );
      Metadata metadata = new Metadata();
      ParseContext parseContext = new ParseContext();

      Parser parser;
      if ( isOoxmlName( file.getName() ) ) {
        parser = new OOXMLParser();
      } else {
        parser = new OfficeParser();
      }

      parser.parse( getFileStream(), textHandler, metadata, parseContext );

      document.setAuthor( metadata.get( Metadata.AUTHOR ) );
      document.setSummary( metadata.get( Metadata.COMMENTS ) );
      document.setContent( textHandler.toString(), bStoreBody );

      // File.length() is a long; clamp instead of casting directly so that
      // files of 2 GiB or more do not wrap around to a negative/garbage size.
      document.setSize( (int) Math.min( file.length(), Integer.MAX_VALUE ) );
      document.setId( file.getCanonicalPath() );

      if ( uriroot != null ) {
        document.setURL( getUrl( uriroot, file ) );
      }

    } catch (FileNotFoundException e) {
      throw new CrawlException("File not found: " + file, e);
    } catch (Exception e) {
      // Covers IOException as well as parser failures; all are reported alike.
      throw new CrawlException("File: " + file, e);
    } finally {
      closeFile();
    }

    return document;
  }

  /**
   * Tells whether a file name denotes an OOXML document, judged purely by a
   * trailing "x" (case-insensitive), e.g. docx / xlsx / pptx.
   */
  private static boolean isOoxmlName( String fileName ) {
    return fileName.endsWith("x") || fileName.endsWith("X");
  }

  /** @return the file extensions this handler is registered for */
  public Set<String> getExtensions() {
    return extensions;
  }

  /** @return the MIME types this handler is registered for */
  public Set<String> getMimeTypes() {
    return mimetypes;
  }
}