package de.dfki.km.leech.parser; import java.io.InputStream; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.Scanner; import java.util.Set; import java.util.logging.Logger; import javax.mail.URLName; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import de.dfki.inquisition.collections.MultiValueHashMap; import de.dfki.km.leech.Leech; import de.dfki.km.leech.io.URLStreamProvider; public class UrlListCrawlerParser extends CrawlerParser { private static final long serialVersionUID = -1061129792080490892L; protected Leech m_leech; @Override protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws Exception { LinkedList<MultiValueHashMap<String, Object>> llSubEntityData = new LinkedList<>(); try (Scanner s = new Scanner(stream, "UTF-8").useDelimiter("\n")) { while (s.hasNext()) { String strUrl = s.next(); if(strUrl.startsWith("//")) continue; if(!strUrl.contains("://") && !strUrl.startsWith("file:")) strUrl = "file:" + strUrl; MultiValueHashMap<String, Object> hsData4Entity = new MultiValueHashMap<>(); hsData4Entity.add("url", strUrl); llSubEntityData.add(hsData4Entity); } } return llSubEntityData.iterator(); } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return Collections.singleton(new MediaType("application", "leechUrlList")); } @Override protected void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws Exception { // NOP } @Override protected void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata2use4recursiveCall, ContentHandler handler2use4recursiveCall, ParseContext context) throws Exception { String strUrl = (String) subDataEntityInformation.getFirst("url"); URLName url = new URLName(strUrl); metadata2use4recursiveCall = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata2use4recursiveCall, context); InputStream stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata2use4recursiveCall, context); try { if(m_leech == null) m_leech = new Leech(); Logger.getLogger(UrlListCrawlerParser.class.getName()).info("Will crawl " + strUrl); Parser parser = m_leech.getParser(); parser.parse(stream, handler2use4recursiveCall, metadata2use4recursiveCall, context); } finally { if(stream != null) stream.close(); } } }