/**
* Copyright 2008 - 2009 Pro-Netics S.P.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package it.pronetics.madstore.crawler.parser.impl;
import it.pronetics.madstore.crawler.model.Link;
import it.pronetics.madstore.crawler.model.Page;
import it.pronetics.madstore.crawler.parser.Parser;
import it.pronetics.madstore.crawler.parser.filter.LinkFilter;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link it.pronetics.madstore.crawler.parser.Parser} implementation extracting links and providing
* URI normalization by:
* <ul>
* <li>Removing unneeded sub-paths.</li>
* <li>Removing the URI fragment.</li>
* <li>Removing the URI query string.</li>
* <li>Making the URI absolute.</li>
* </ul>
*
* @author Salvatore Incandela
* @author Sergio Bossa
*/
public class ParserImpl implements Parser {

    private static final Logger LOG = LoggerFactory.getLogger(ParserImpl.class);

    /**
     * Extracts the links contained in the given page, normalizes each of them (fragment removed,
     * query string removed, made absolute against the page link) and returns the ones accepted
     * by the given filter.
     * <p>
     * A link that cannot be normalized is logged and skipped, so a single malformed link does not
     * prevent the remaining ones from being returned.
     *
     * @param page The page to extract links from.
     * @param linkFilter The filter deciding which normalized links are kept.
     * @return The accepted links, collected in a set; empty if nothing could be extracted.
     */
    public Collection<Link> parse(Page page, LinkFilter linkFilter) {
        LOG.info("Parsing and extracting links from: {}", page.getLink());
        Collection<String> extractedLinks = extractLinks(page);
        Collection<Link> parsedLinks = new HashSet<Link>(extractedLinks.size());
        for (String link : extractedLinks) {
            try {
                String normalizedLink = removeFragment(link);
                normalizedLink = removeQueryString(normalizedLink);
                normalizedLink = makeAbsolute(page.getLink().getLink(), normalizedLink);
                Link linkToAdd = new Link(normalizedLink);
                if (linkFilter.accept(linkToAdd)) {
                    parsedLinks.add(linkToAdd);
                }
            } catch (Exception ex) {
                // Best effort: report the offending link and go on with the next one.
                LOG.warn("Error parsing link: {}", link);
                LOG.warn(ex.getMessage());
                LOG.debug(ex.getMessage(), ex);
            }
        }
        return parsedLinks;
    }

    /**
     * Collects the trimmed link of every link tag found in the page data.
     *
     * @param page The page whose data has to be scanned.
     * @return The extracted links in document order, or an empty collection if the extraction
     * fails (the failure is logged, not propagated).
     */
    private Collection<String> extractLinks(Page page) {
        try {
            org.htmlparser.Parser htmlParser = new org.htmlparser.Parser(page.getData());
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList linkNodes = htmlParser.extractAllNodesThatMatch(linkFilter);
            Collection<String> links = new ArrayList<String>(linkNodes.size());
            for (int i = 0; i < linkNodes.size(); i++) {
                String link = ((LinkTag) linkNodes.elementAt(i)).extractLink().trim();
                links.add(link);
            }
            return links;
        } catch (Exception ex) {
            LOG.warn("Error extracting links from: {}", page.getLink());
            LOG.warn(ex.getMessage());
            LOG.debug(ex.getMessage(), ex);
            return new ArrayList<String>(0);
        }
    }

    /**
     * Strips everything from the first '#' onwards.
     *
     * @param url The url to process.
     * @return The url without its fragment, or the url itself if it has none.
     */
    private String removeFragment(String url) {
        int fragmentIndex = url.indexOf('#');
        return fragmentIndex >= 0 ? url.substring(0, fragmentIndex) : url;
    }

    /**
     * Strips everything from the first '?' onwards.
     *
     * @param url The url to process.
     * @return The url without its query string, or the url itself if it has none.
     */
    private String removeQueryString(String url) {
        int queryStringIndex = url.indexOf('?');
        return queryStringIndex >= 0 ? url.substring(0, queryStringIndex) : url;
    }

    /**
     * Resolves the link against the base and normalizes the result.
     *
     * @param base The URI of the page the link was found in.
     * @param link The link to resolve; if null or empty, the normalized base itself is returned.
     * @return The absolute, normalized URI as a string.
     * @throws Exception If the base or the link is not a valid URI.
     */
    private String makeAbsolute(String base, String link) throws Exception {
        URI baseUri = new URI(base);
        if (link == null || link.equals("")) {
            return baseUri.normalize().toString();
        }
        return withRootPath(baseUri).resolve(link).normalize().toString();
    }

    /**
     * Gives a root path ("/") to a base URI that has an authority but an empty path, e.g.
     * <code>http://example.com</code>.
     * <p>
     * RFC 3986 (section 5.2.3) requires a relative path to be merged onto such a base as
     * "/" + path. Some JDK releases instead concatenate the two without a separator, so that
     * <code>feed.atom</code> resolved against <code>http://example.com</code> becomes
     * <code>http://example.comfeed.atom</code>. Adding the root path up front gives the correct
     * result regardless of the runtime.
     *
     * @param baseUri The base URI links are going to be resolved against.
     * @return The base with path "/" if it had an authority and an empty path, otherwise the
     * base unchanged.
     */
    private URI withRootPath(URI baseUri) {
        String rawPath = baseUri.getRawPath();
        boolean emptyPath = rawPath != null && rawPath.length() == 0;
        if (baseUri.getRawAuthority() != null && emptyPath) {
            // Resolving an absolute path only keeps scheme and authority of the base.
            return baseUri.resolve("/");
        }
        return baseUri;
    }
}