/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.parser.htmlParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.collections.CollectionUtils;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.SearchLibException.XPathNotSupported;
import com.jaeksoft.searchlib.streamlimiter.LimitException;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.StringUtils;
/**
 * Enumerates the HTML parser implementations that can be selected by name or
 * by label.
 * <p>
 * Every constant except {@link #BestScoreParser} is bound to an
 * {@link HtmlDocumentProvider} implementation class. {@link #BestScoreParser}
 * has no class of its own: it runs several parsers and keeps the one chosen by
 * {@code HtmlDocumentProvider.bestScore}.
 */
public enum HtmlParserEnum {

    FirefoxParser("Firefox", FirefoxParser.class),

    PhantomJS("PhantomJS", PhantomJSParser.class),

    HtmlUnitParser("HtmlUnit", HtmlUnitParser.class),

    HtmlUnitJSParser("HtmlUnit (with Javascript)", HtmlUnitJavaScriptParser.class),

    HtmlCleanerParser("HtmlCleaner", HtmlCleanerParser.class),

    JSoupParser("Jsoup", JSoupParser.class),

    NekoHtmlParser("NekoHtml", NekoHtmlParser.class),

    StrictXhtmlParser("StrictXml", StrictXhtmlParser.class),

    TagSoupParser("TagSoup", TagsoupParser.class),

    /** Pseudo-parser: no implementation class, see {@link #getHtmlParser}. */
    BestScoreParser("Best score", null);

    /** Human readable name of the parser. */
    private final String label;

    /** Implementation class; {@code null} only for {@link #BestScoreParser}. */
    private final Class<? extends HtmlDocumentProvider> classDef;

    /**
     * Parsers tried, in this order, by the best-score strategy once the strict
     * XHTML attempt has not produced a document. The array is never modified.
     */
    private static final HtmlParserEnum[] BEST_SCORE_ORDER = { TagSoupParser, NekoHtmlParser, HtmlCleanerParser,
            JSoupParser };

    private HtmlParserEnum(String label, Class<? extends HtmlDocumentProvider> classDef) {
        this.label = label;
        this.classDef = classDef;
    }

    /**
     * @return the human readable label of this parser
     */
    public String getLabel() {
        return label;
    }

    /**
     * Implements the {@link #BestScoreParser} strategy.
     * <p>
     * The strict XHTML parser is tried first and returned immediately when it
     * yields a non-null root node. Otherwise every parser of
     * {@link #BEST_SCORE_ORDER} is initialised and the candidates are handed to
     * {@code HtmlDocumentProvider.bestScore}.
     *
     * @param charset
     *            the charset passed to each parser
     * @param streamLimiter
     *            the content to parse
     * @param requireXPath
     *            when {@code true}, parsers without XPath support are rejected
     * @return whatever {@code HtmlDocumentProvider.bestScore} returns for the
     *         collected candidates (the list may be empty)
     */
    private static HtmlDocumentProvider findBestProvider(String charset, StreamLimiter streamLimiter,
            boolean requireXPath) throws LimitException, InstantiationException, IllegalAccessException, IOException,
            ParserConfigurationException {
        // Failures are only reported if no parser at all could be set up.
        List<Exception> errors = new ArrayList<Exception>();

        // Best effort: any failure of the strict parser just means we fall
        // back to the lenient parsers below.
        try {
            HtmlDocumentProvider provider = HtmlParserEnum.StrictXhtmlParser.getHtmlParser(charset, streamLimiter,
                    requireXPath);
            if (provider.getRootNode() != null) {
                return provider;
            }
        } catch (Exception e) {
            errors.add(e);
        }

        List<HtmlDocumentProvider> providerList = new ArrayList<HtmlDocumentProvider>(BEST_SCORE_ORDER.length);
        for (HtmlParserEnum htmlParserEnum : BEST_SCORE_ORDER) {
            // Only these failures are tolerated per parser; the other checked
            // exceptions declared by this method abort the whole search.
            try {
                providerList.add(htmlParserEnum.getHtmlParser(charset, streamLimiter, requireXPath));
            } catch (XPathNotSupported e) {
                errors.add(e);
            } catch (SAXException e) {
                errors.add(e);
            } catch (SearchLibException e) {
                errors.add(e);
            }
        }

        if (CollectionUtils.isEmpty(providerList)) {
            Logging.error(StringUtils.fastConcat("No HTML provider found for: " + streamLimiter.getOriginURL()));
            for (Exception e : errors) {
                Logging.error(e);
            }
        }
        return HtmlDocumentProvider.bestScore(providerList);
    }

    /**
     * Creates and initialises the parser represented by this constant.
     * <p>
     * For {@link #BestScoreParser} the call is delegated to the best-score
     * strategy. For every other constant a new instance of the implementation
     * class is created; the XPath requirement is checked before the instance
     * is initialised with the charset and the stream.
     *
     * @param charset
     *            the charset passed to the parser
     * @param streamLimiter
     *            the content to parse
     * @param requireXPath
     *            when {@code true}, a parser that does not support XPath is
     *            rejected
     * @return the initialised provider
     * @throws SearchLibException
     *             a {@code SearchLibException.XPathNotSupported} is thrown when
     *             {@code requireXPath} is set and the parser does not support
     *             XPath
     */
    public HtmlDocumentProvider getHtmlParser(String charset, StreamLimiter streamLimiter, boolean requireXPath)
            throws LimitException, IOException, InstantiationException, IllegalAccessException, SAXException,
            ParserConfigurationException, SearchLibException {
        if (this == BestScoreParser) {
            return findBestProvider(charset, streamLimiter, requireXPath);
        }
        HtmlDocumentProvider htmlParser = classDef.newInstance();
        if (requireXPath && !htmlParser.isXPathSupported()) {
            throw new SearchLibException.XPathNotSupported(htmlParser);
        }
        htmlParser.init(charset, streamLimiter);
        return htmlParser;
    }

    /**
     * @return the labels of all constants, in declaration order
     */
    public static String[] getLabelArray() {
        // values() allocates a new array on each call, so fetch it once.
        HtmlParserEnum[] parsers = values();
        String[] labelArray = new String[parsers.length];
        int i = 0;
        for (HtmlParserEnum htmlParserEnum : parsers) {
            labelArray[i++] = htmlParserEnum.label;
        }
        return labelArray;
    }

    /**
     * Looks up a parser by constant name or by label, ignoring case.
     *
     * @param value
     *            the name or label to look for; {@code null} matches nothing
     * @return the matching constant, or {@link #BestScoreParser} when there is
     *         no match
     */
    public static HtmlParserEnum find(String value) {
        for (HtmlParserEnum htmlParserEnum : values()) {
            if (htmlParserEnum.name().equalsIgnoreCase(value) || htmlParserEnum.label.equalsIgnoreCase(value)) {
                return htmlParserEnum;
            }
        }
        return BestScoreParser;
    }
}