/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.zip; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.util.Properties; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; import org.apache.hadoop.conf.Configuration; /** * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter. * Nutch parse plugin for zip files - Content Type : application/zip * * @author Rohit Kulkarni & Ashish Vaidya */ public class ZipParser implements Parser { private static final Log LOG = LogFactory.getLog(ZipParser.class); private Configuration conf; /** Creates a new instance of ZipParser */ public ZipParser() { } public ParseResult getParse(final Content content) { String resultText = null; String resultTitle = null; Outlink[] outlinks = null; List outLinksList = new ArrayList(); Properties properties = null; try { final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); final int len = Integer.parseInt(contentLen); if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); } final byte[] contentInBytes = content.getContent(); final ByteArrayInputStream bainput = new ByteArrayInputStream( contentInBytes); final InputStream input = bainput; if (contentLen != null && contentInBytes.length != len) { return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + contentInBytes.length + " bytes. Parser can't handle incomplete pdf file.") .getEmptyParseResult(content.getUrl(), getConf()); } ZipTextExtractor extractor = new ZipTextExtractor(getConf()); // extract text resultText = extractor.extractText(new ByteArrayInputStream( contentInBytes), content.getUrl(), outLinksList); } catch (Exception e) { return new ParseStatus(ParseStatus.FAILED, "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf()); } if (resultText == null) { resultText = ""; } if (resultTitle == null) { resultTitle = ""; } outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata()); if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); } return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData)); } public void setConf(Configuration conf) { this.conf = conf; } public Configuration getConf() { return this.conf; } }