/**
 *  Parser.java
 *  Copyright 2010 by Michael Peter Christen
 *  First released 27.4.2010 at http://yacy.net
 *
 *  This file is part of YaCy Content Integration
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file COPYING.LESSER.
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cider.parser;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import net.yacy.cider.document.Charset;
import net.yacy.cider.document.DataSource;
import net.yacy.cider.document.Extension;
import net.yacy.cider.document.MimeType;
import net.yacy.cider.document.URI;
import net.yacy.cider.parser.idiom.pdfIdiom;
import net.yacy.cider.util.FileUtils;

import org.apache.log4j.Logger;

import com.hp.hpl.jena.rdf.model.Model;

/**
 * Static registry of {@link Idiom} parsers and dispatcher that selects one or
 * more of them for a document by file extension and/or mime type.
 */
public class Parser {

    // the logger is named after this class so that messages are attributed correctly
    private static final Logger log = Logger.getLogger(Parser.class.getName());

    /** registered parser for each supported mime type */
    private static final Map<MimeType, Idiom> mime2parser = new ConcurrentHashMap<MimeType, Idiom>();
    /** registered parser for each supported file extension */
    private static final Map<Extension, Idiom> ext2parser = new ConcurrentHashMap<Extension, Idiom>();
    /** mime type assumed for each supported file extension */
    private static final Map<Extension, MimeType> ext2mime = new ConcurrentHashMap<Extension, MimeType>();

    static {
        initParser(new pdfIdiom());
    }

    /**
     * @return a new set containing every parser that is registered for at least
     *         one extension or mime type
     */
    public static Set<Idiom> idioms() {
        Set<Idiom> c = new HashSet<Idiom>();
        c.addAll(ext2parser.values());
        c.addAll(mime2parser.values());
        return c;
    }

    /**
     * Register a parser for all mime types and extensions it declares.
     * An existing registration for the same key is overwritten and an error is logged.
     * The first mime type the parser reports is used as the mime type for all of
     * its extensions.
     * @param parser the parser to register
     */
    private static void initParser(Idiom parser) {
        MimeType prototypeMime = null;
        for (MimeType mime: parser.supportedMimeTypes()) {
            // process the mime types
            if (prototypeMime == null) prototypeMime = mime;
            Idiom p0 = mime2parser.get(mime);
            if (p0 != null) log.error("parser for mime '" + mime + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
            mime2parser.put(mime, parser);
            log.info("Parser for mime type '" + mime + "': " + parser.getName());
        }

        // the extension -> mime mapping can only be filled if the parser declared a mime type
        if (prototypeMime != null) for (Extension ext: parser.supportedExtensions()) {
            MimeType s = ext2mime.get(ext);
            if (s != null) log.error("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
            ext2mime.put(ext, prototypeMime);
        }

        for (Extension ext: parser.supportedExtensions()) {
            // process the extensions
            Idiom p0 = ext2parser.get(ext);
            if (p0 != null) log.error("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
            ext2parser.put(ext, parser);
            log.info("Parser for extension '" + ext + "': " + parser.getName());
        }
    }

    /**
     * Parse a data source with the parsers selected by its URI and mime type.
     * @param source the source to parse
     * @return the model produced by the first parser that succeeds
     * @throws InterruptedException
     * @throws ParserException if no parser produced a result
     */
    public static Model parseSource(final DataSource source) throws InterruptedException, ParserException {
        if (log.isDebugEnabled()) log.debug("Parsing '" + source.getURI().toNormalform(true, true) + "' from DataSource");
        List<Idiom> idioms = idiomParser(source.getURI(), source.getMimeType());
        return parseSource(source, idioms);
    }

    /**
     * Parse the content found at the given location. The mime type is derived
     * from the file extension of the location and the charset is set to "UTF-8".
     * The stream that is opened on the location is closed before this method returns.
     * @param location the location of the content
     * @return the parsed model
     * @throws InterruptedException
     * @throws ParserException if the location cannot be read or parsing fails
     */
    public static Model parseSource(final URI location) throws InterruptedException, ParserException {
        InputStream sourceStream = null;
        try {
            final MimeType mimeType = mimeOf(location);
            final long contentLength = location.length();
            sourceStream = location.getInputStream();
            return parseSource(location, mimeType, "UTF-8", contentLength, sourceStream);
        } catch (IOException e) {
            throw new ParserException("cannot read file: " + e.getMessage(), location, e);
        } finally {
            // the stream was opened here, therefore it must be closed here; a failing close is not fatal
            if (sourceStream != null) try { sourceStream.close(); } catch (final IOException ignored) {}
        }
    }

    /**
     * Parse the content of a file.
     * @param location the URI that the content belongs to
     * @param mimeType the mime type of the content
     * @param charset the charset of the content
     * @param sourceFile the file holding the content; must exist, be readable and not be empty
     * @return the parsed model
     * @throws InterruptedException
     * @throws ParserException if the file is not usable or parsing fails
     */
    public static Model parseSource(
            final URI location,
            final MimeType mimeType,
            final String charset,
            final File sourceFile
        ) throws InterruptedException, ParserException {

        BufferedInputStream sourceStream = null;
        try {
            if (log.isDebugEnabled()) log.debug("Parsing '" + location + "' from file");
            if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) {
                final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
                log.info("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location);
            }
            sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
        } catch (final Exception e) {
            // declared exceptions are passed through unchanged, everything else is wrapped
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
            log.error("Unexpected exception in parseSource from File: " + e.getMessage(), e);
            // keep the original exception as cause so that the stack trace is not lost
            throw new ParserException("Unexpected exception: " + e.getMessage(), location, e);
        } finally {
            // closing is best-effort; a failing close must not hide the parsing result
            if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ignored) {}
        }
    }

    /**
     * Parse content from a stream. The stream is not closed by this method.
     * @param location the URI that the content belongs to
     * @param mimeType the mime type of the content
     * @param charset the charset of the content
     * @param contentLength the number of bytes of the content
     * @param sourceStream the stream to read the content from
     * @return the parsed model
     * @throws InterruptedException
     * @throws ParserException if no parser is available, reading fails or parsing fails
     */
    public static Model parseSource(
            final URI location,
            MimeType mimeType,
            final String charset,
            final long contentLength,
            final InputStream sourceStream
        ) throws InterruptedException, ParserException {
        if (log.isDebugEnabled()) log.debug("Parsing '" + location + "' from stream");
        List<Idiom> idioms = idiomParser(location, mimeType);
        assert !idioms.isEmpty();

        // if we do not have more than one parser or the content size is over MaxInt
        // then we use only one stream-oriented parser.
        if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
            // idiomParser may return an empty list; fail with a ParserException instead of
            // an IndexOutOfBoundsException from get(0)
            if (idioms.isEmpty()) throw new ParserException("no parser found for '" + location + "'", location);
            // use a specific stream-oriented parser
            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
        }

        // in case that we know more parsers we first transform the content into a byte[] and use that as base
        // for a number of different parse attempts.
        try {
            return parseSource(new DataSource(location, mimeType, charset, FileUtils.read(sourceStream, (int) contentLength)), idioms);
        } catch (IOException e) {
            throw new ParserException(e.getMessage(), location, e);
        }
    }

    /**
     * Parse content that is given as byte array.
     * @param location the URI that the content belongs to
     * @param mimeTypeString the mime type of the content as string
     * @param charset the charset of the content
     * @param sourceArray the content
     * @return the parsed model
     * @throws InterruptedException
     * @throws ParserException if no parser produced a result
     */
    public static Model parseSource(
            final URI location,
            String mimeTypeString,
            final String charset,
            final byte[] sourceArray
        ) throws InterruptedException, ParserException {
        if (log.isDebugEnabled()) log.debug("Parsing '" + location + "' from byte array");
        MimeType mimeType = MimeType.getMimetype(mimeTypeString);
        List<Idiom> idioms = idiomParser(location, mimeType);
        assert !idioms.isEmpty();

        return parseSource(new DataSource(location, mimeType, charset, sourceArray), idioms);
    }

    /**
     * Parse content from a stream with exactly one given parser.
     * The charset is passed through Charset.patchCharsetEncoding before it is used.
     * @param location the URI that the content belongs to
     * @param mimeType the mime type of the content
     * @param idiom the parser to use
     * @param charset the charset of the content
     * @param contentLength the number of bytes of the content (not used by this method)
     * @param sourceStream the stream to read the content from
     * @return the parsed model
     * @throws InterruptedException
     * @throws ParserException if the parser fails; the failure of the parser is attached as cause
     */
    private final static Model parseSource(
            final URI location,
            MimeType mimeType,
            Idiom idiom,
            final String charset,
            final long contentLength,
            final InputStream sourceStream
        ) throws InterruptedException, ParserException {
        if (log.isDebugEnabled()) log.debug("Parsing '" + location + "' from stream");
        final Extension ext = location.getFileExtension();
        final String documentCharset = Charset.patchCharsetEncoding(charset);
        assert idiom != null;

        if (log.isDebugEnabled()) log.info("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + ((ext == null) ? "null" : ext.toString()) + "'.");
        try {
            return idiom.parse(new DataSource(location, mimeType, documentCharset, sourceStream));
        } catch (ParserException e) {
            throw new ParserException("parser failed: " + idiom.getName(), location, e);
        }
    }

    /**
     * Try the given parsers one after another until one of them returns a model.
     * Failures of single parsers are collected and only logged if no parser succeeded.
     * @param source the source to parse
     * @param idioms the parsers to try, in the order in which they shall be tried
     * @return the model of the first parser that returned a non-null result
     * @throws InterruptedException
     * @throws ParserException if no parser returned a model
     */
    public final static Model parseSource(
            final DataSource source,
            List<Idiom> idioms
        ) throws InterruptedException, ParserException {
        if (log.isDebugEnabled()) log.debug("Parsing " + source.getURI());
        assert !idioms.isEmpty();

        Model doc = null;
        HashMap<Idiom, ParserException> failedParser = new HashMap<Idiom, ParserException>();
        for (Idiom parser: idioms) {
            try {
                doc = parser.parse(source);
            } catch (ParserException e) {
                // remember the failure; it is reported only if all parsers fail
                failedParser.put(parser, e);
            }
            if (doc != null) break;
        }

        if (doc == null) {
            if (failedParser.size() == 0) {
                // no parser threw an exception, but none delivered a result either
                throw new ParserException("parsing failed", source.getURI());
            } else {
                StringBuilder failedParsers = new StringBuilder();
                for (Map.Entry<Idiom, ParserException> error: failedParser.entrySet()) {
                    log.warn("tried parser '" + error.getKey().getName() + "' to parse " + source.getURI().toNormalform(true, false) + " but failed: " + error.getValue().getMessage(), error.getValue());
                    failedParsers.append(error.getKey().getName()).append(' ');
                }
                throw new ParserException("all parser failed: " + failedParsers, source.getURI());
            }
        }
        return doc;
    }

    /**
     * check if the parser supports the given content.
     * @param url
     * @param mimeType
     * @return returns null if the content is supported. If the content is not supported, return a error string.
     */
    public static String supports(final URI url, MimeType mimeType) {
        try {
            // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
            List<Idiom> idioms = idiomParser(url, mimeType);
            return (idioms == null || idioms.isEmpty()) ? "no parser found" : null;
        } catch (ParserException e) {
            // in case that a parser is not available, return a error string describing the problem.
            return e.getMessage();
        }
    }

    /**
     * find a parser for a given url and mime type
     * because mime types returned by web servers are sometimes wrong, we also compute the mime type again
     * from the extension that can be extracted from the url path. That means that there are 3 criteria
     * that can be used to select a parser:
     * - the given extension
     * - the given mime type
     * - the mime type computed from the extension
     * @param url the given url
     * @param mimeType the given mime type, may be null
     * @return a list of Idiom parsers that may be appropriate for the given criteria.
     *         The list may be empty if no mime type is known for the extension of the url.
     * @throws ParserException if a mime type is known for the extension but no parser was found at all
     */
    private static List<Idiom> idiomParser(final URI url, MimeType mimeType) throws ParserException {
        List<Idiom> idioms = new ArrayList<Idiom>(2);

        // check extension
        Extension ext = url.getFileExtension();
        Idiom idiom;
        if (ext != null) {
            idiom = ext2parser.get(ext);
            if (idiom != null) idioms.add(idiom);
        }

        // check given mime type
        if (mimeType != null) {
            idiom = mime2parser.get(mimeType);
            if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
        }

        // check mime type computed from extension
        // a ConcurrentHashMap does not accept null keys, therefore the lookup is skipped if there is no extension
        MimeType mimeType2 = (ext == null) ? null : ext2mime.get(ext);
        if (mimeType2 == null) return idioms; // in this case we are a bit more lazy
        idiom = mime2parser.get(mimeType2);
        if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);

        // finally check if we found any parser
        // the mime type is concatenated directly because it may be null here
        if (idioms.isEmpty()) throw new ParserException("no parser found for extension '" + ext + "' and mime type '" + mimeType + "'", url);

        return idioms;
    }

    /**
     * check if a parser is registered for the given mime type
     * @param mimeType
     * @return null if the mime type is supported, otherwise an error string
     */
    public static String supportsMime(MimeType mimeType) {
        if (mimeType == null) return "mimeType == null";
        if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
        return null;
    }

    /**
     * check if a parser is registered for the file extension of the given url
     * @param url
     * @return null if the extension is supported or if the url has no extension, otherwise an error string
     */
    public static String supportsExtension(final URI url) {
        if (url == null) return "url == null";
        Extension ext = url.getFileExtension();
        if (ext == null) return null;
        MimeType mimeType = ext2mime.get(ext);
        if (mimeType == null) return "no parser available";
        Idiom idiom = mime2parser.get(mimeType);
        assert idiom != null;
        if (idiom == null) return "no parser available (internal error!)";
        return null;
    }

    /**
     * @param url
     * @return the mime type that is registered for the file extension of the url,
     *         or null if the url is null, has no extension or the extension is unknown
     */
    public static MimeType mimeOf(URI url) {
        if (url == null) return null;
        return mimeOf(url.getFileExtension());
    }

    /**
     * @param ext
     * @return the mime type that is registered for the extension,
     *         or null if the extension is null or unknown
     */
    public static MimeType mimeOf(Extension ext) {
        if (ext == null) return null;
        return ext2mime.get(ext);
    }

}