/*
 * Copyright 2007 T-Rank AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.wikipedia.producer;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLStreamException;

import org.apache.tools.bzip2.CBZip2InputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.DocumentProducer;
import no.trank.openpipe.util.Iterators;
import no.trank.openpipe.wikipedia.WikipediaDumpHandler;

/**
 * Produces documents from a mediawiki dump.
 *
 * @version $Revision$
 */
public class WikipediaDocumentProducer implements DocumentProducer {
   private static final Logger log = LoggerFactory.getLogger(WikipediaDocumentProducer.class);
   private WikipediaDumpHandler dumpHandler;
   private WikiDocumentSplitter documentSplitter;
   private int maxDocs = -1;
   private String contentField = "wikiPage";
   private boolean indexOnlyNew = true;

   @Override
   public void init() {
      if (dumpHandler.isNewDump() || !indexOnlyNew) {
         final File file = dumpHandler.getDumpFile();
         try {
            FileInputStream in = new FileInputStream(file);
            log.debug("Opening wikipedia dump at: {}", file.getAbsolutePath());
            if (isBunzip2(file)) {
               // Have to strip away the first two bytes of the .bz2 file if they are 'BZ'. A bug in CBZip2InputStream?
               documentSplitter = new WikiDocumentSplitter(new BufferedInputStream(new CBZip2InputStream(
                     new BufferedInputStream(new InputStreamPrefixStripper(in, new byte[]{(byte) 'B', (byte) 'Z'})))));
            } else {
               documentSplitter = new WikiDocumentSplitter(new BufferedInputStream(in));
            }
         } catch (XMLStreamException e) {
            throw new RuntimeException("Could not parse dump file", e);
         } catch (IOException e) {
            log.error("Could not read file: " + file.getAbsoluteFile(), e);
         }
      }
   }

   private static boolean isBunzip2(File file) {
      return file.getName().toLowerCase().endsWith(".bz2");
   }

   @Override
   public void close() {
      if (documentSplitter != null) {
         try {
            documentSplitter.close();
         } catch (Exception e) {
            // Do nothing
         }
      }
   }

   @Override
   public void fail() {
      if (documentSplitter != null) {
         try {
            documentSplitter.close();
         } catch (Exception e) {
            // Do nothing
         }
      }
   }

   /**
    * Sets the <tt>WikipediaDumpHandler</tt> that handles the dump-file.
    *
    * @param dumpHandler the handler for the dump-file.
    */
   public void setDumpHandler(WikipediaDumpHandler dumpHandler) {
      this.dumpHandler = dumpHandler;
   }

   /**
    * Gets the maximum number of documents to produce from the dump.
    *
    * @return the maximum number of documents to produce from the dump.
    */
   public int getMaxDocs() {
      return maxDocs;
   }

   /**
    * Sets the maximum number of documents to produce from the dump.
    *
    * <p>Default is <tt>-1</tt>, meaning all documents in the dump will be produced.</p>
    *
    * @param maxDocs the maximum number of documents to produce from the dump.
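    *
    * <p>For example (an illustrative sketch):</p>
    * <pre>
    * producer.setMaxDocs(100); // produce at most 100 documents, then stop
    * </pre>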
    */
   public void setMaxDocs(int maxDocs) {
      this.maxDocs = maxDocs;
   }

   /**
    * Gets the name of the field the document xml will be inserted into.
    *
    * @return the name of the field the document xml will be inserted into.
    */
   public String getContentField() {
      return contentField;
   }

   /**
    * Sets the name of the field the document xml will be inserted into.
    *
    * @param contentField the name of the field the document xml will be inserted into.
    */
   public void setContentField(String contentField) {
      this.contentField = contentField;
   }

   @Override
   public Iterator<Document> iterator() {
      if (indexOnlyNew && !dumpHandler.isNewDump()) {
         log.info("Current wiki dump is up to date. Skipping produce. (Set indexOnlyNew to false to force indexing.)");
         return Iterators.emptyIterator();
      }
      return new WikiDocumentIterator(maxDocs);
   }

   /**
    * Returns whether the producer only produces documents when a new dump has been downloaded.
    *
    * @return <code>true</code> if the producer only produces documents when a new dump is available.
    */
   public boolean isIndexOnlyNew() {
      return indexOnlyNew;
   }

   /**
    * Sets whether the producer should only produce documents when a new dump has been downloaded.
    * If set to <code>true</code> (the default), the producer only produces documents when a new dump is
    * available. Set this to <code>false</code> to produce documents from a previously downloaded dump.
    *
    * @param indexOnlyNew <code>true</code> to produce documents only when a new dump is available.
    */
   public void setIndexOnlyNew(boolean indexOnlyNew) {
      this.indexOnlyNew = indexOnlyNew;
   }

   private class WikiDocumentIterator implements Iterator<Document> {
      private final int maxDocs;
      private int processedDocs = 0;

      private WikiDocumentIterator(int maxDocs) {
         this.maxDocs = maxDocs;
      }

      @Override
      public boolean hasNext() {
         return (maxDocs < 0 || maxDocs > processedDocs) && documentSplitter.hasNext();
      }

      @Override
      public Document next() {
         if (!hasNext()) {
            throw new NoSuchElementException();
         }
         final Document doc = new Document();
         doc.addFieldValue(contentField, documentSplitter.next());
         processedDocs++;
         return doc;
      }

      @Override
      public void remove() {
         throw new UnsupportedOperationException("Remove not supported");
      }
   }
}
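/*
 * Usage sketch (illustrative only; the dumpHandler below is an assumption and must be a
 * WikipediaDumpHandler configured elsewhere, e.g. through the pipeline configuration):
 *
 *    WikipediaDocumentProducer producer = new WikipediaDocumentProducer();
 *    producer.setDumpHandler(dumpHandler);
 *    producer.setIndexOnlyNew(false); // also produce from a previously downloaded dump
 *    producer.init();
 *    try {
 *       for (Document doc : producer) {
 *          // each document carries the raw page XML in the "wikiPage" field
 *       }
 *    } finally {
 *       producer.close();
 *    }
 */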