/*
* Copyright (C) 2000 - 2011 TagServlet Ltd
*
* This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
*
* OpenBD is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* Free Software Foundation,version 3.
*
* OpenBD is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenBD. If not, see http://www.gnu.org/licenses/
*
* Additional permission under GNU GPL version 3 section 7
*
* If you modify this Program, or any covered work, by linking or combining
* it with any of the JARS listed in the README.txt (or a modified version of
* (that library), containing parts covered by the terms of that JAR, the
* licensors of this Program grant you additional permission to convey the
* resulting work.
* README.txt @ http://www.openbluedragon.org/license/README.txt
*
* http://www.openbluedragon.org/
*
* $Id: FileHandlerHTMLImpl.java 1638 2011-07-31 16:08:50Z alan $
*/
package com.bluedragon.search.index.crawl.handler;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.bluedragon.search.DocumentWrap;
public class FileHandlerHTMLImpl extends AbstractFileHandler {
private static Set<String> extensions = new HashSet<String>( Arrays.asList("htm","html","cfm","cfml","cfc","asp","aspx","php","jsp","jspx") );
private static Set<String> mimetypes = new HashSet<String>( Arrays.asList("text/html") );
private Set<String> anchors;
public FileHandlerHTMLImpl(boolean bStoreBody) {
super(bStoreBody);
anchors = new HashSet<String>();
}
public Set<String> getExtensions() {
return extensions;
}
public Set<String> getMimeTypes() {
return mimetypes;
}
public Object getExtra(){
return anchors;
}
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
DocumentWrap document = new DocumentWrap();
try{
// Get the body from the WORD
String htmlBody = FileUtils.readFileToString(file);
Document doc = Jsoup.parse(htmlBody);
if ( uriroot != null )
doc.setBaseUri(uriroot);
setAnchors(doc, uriroot);
// Setup the document
document.setContent( doc.text(), bStoreBody );
document.setSize( (int)file.length() );
document.setType( "text/html" );
document.setId( file.getCanonicalPath() );
if ( uriroot != null )
document.setURL( getUrl( uriroot, file ) );
} catch (FileNotFoundException e) {
throw new CrawlException("File not found: " + file, e);
} catch (IOException e) {
throw new CrawlException("File: " + file, e);
} catch (Exception e) {
throw new CrawlException("File: " + file, e);
}
return document;
}
/**
* Runs around all the internal links and pulls out all the URLs
* @param doc
* @param baseUri
*/
private void setAnchors( Document doc, String baseUri ){
Elements links = doc.select("a[href]");
for (Element link : links) {
if ( baseUri != null )
link.setBaseUri(baseUri);
String newLink = link.attr("abs:href");
if ( newLink.indexOf("#") != -1 )
newLink = newLink.substring( 0, newLink.indexOf("#") );
anchors.add( newLink );
}
}
}