/*
 * Copyright 2007 T-Rank AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.wikipedia.producer;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLStreamException;

import org.apache.tools.bzip2.CBZip2InputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.DocumentProducer;
import no.trank.openpipe.util.Iterators;
import no.trank.openpipe.wikipedia.WikipediaDumpHandler;

/**
 * Produces documents from a mediawiki dump.
 *
 * @version $Revision$
 */
public class WikipediaDocumentProducer implements DocumentProducer {
   private static final Logger log = LoggerFactory.getLogger(WikipediaDocumentProducer.class);
   private WikipediaDumpHandler dumpHandler;
   private WikiDocumentSplitter documentSplitter;
   private int maxDocs = -1;
   private String contentField = "wikiPage";
   private boolean indexOnlyNew = true;

   @Override
   public void init() {
      if (dumpHandler.isNewDump() || !indexOnlyNew) {
         final File file = dumpHandler.getDumpFile();
         try {
            FileInputStream in = new FileInputStream(file);
            log.debug("Opening wikipedia dump at: {}", file.getAbsolutePath());
            if (isBunzip2(file)) {
               // Have to strip away the first two bytes of the .bz2 file if they are 'BZ'. A bug in CBZip2InputStream?
               documentSplitter = new WikiDocumentSplitter(new BufferedInputStream(new CBZip2InputStream(
                     new BufferedInputStream(new InputStreamPrefixStripper(in, new byte[]{(byte) 'B', (byte) 'Z'})))));
            } else {
               documentSplitter = new WikiDocumentSplitter(new BufferedInputStream(in));
            }
         } catch (XMLStreamException e) {
            throw new RuntimeException("Could not parse dump file", e);
         } catch (IOException e) {
            log.error("Could not read file: " + file.getAbsoluteFile(), e);
         }
      }
   }

   private static boolean isBunzip2(File file) {
      return file.getName().toLowerCase().endsWith(".bz2");
   }

   @Override
   public void close() {
      if (documentSplitter != null) {
         try {
            documentSplitter.close();
         } catch (Exception e) {
            // Do nothing
         }
      }
   }

   @Override
   public void fail() {
      if (documentSplitter != null) {
         try {
            documentSplitter.close();
         } catch (Exception e) {
            // Do nothing
         }
      }
   }

   /**
    * Sets the <tt>WikipediaDumpHandler</tt> that handles the dump-file.
    *
    * @param dumpHandler the handler for the dump-file.
    */
   public void setDumpHandler(WikipediaDumpHandler dumpHandler) {
      this.dumpHandler = dumpHandler;
   }

   /**
    * Gets the maximum number of documents to produce from the dump.
    *
    * @return the maximum number of documents to produce from the dump.
    */
   public int getMaxDocs() {
      return maxDocs;
   }

   /**
    * Sets the maximum number of documents to produce from the dump.
    *
    * <p>Default is <tt>-1</tt>, meaning all documents in the dump will be produced.</p>
    *
    * @param maxDocs the maximum number of documents to produce from the dump.
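    *
    * <p>For example (an illustrative sketch):</p>
    * <pre>
    * producer.setMaxDocs(100); // produce at most 100 documents, then stop
    * </pre>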
    */
   public void setMaxDocs(int maxDocs) {
      this.maxDocs = maxDocs;
   }

   /**
    * Gets the name of the field the document xml will be inserted into.
    *
    * @return the name of the field the document xml will be inserted into.
    */
   public String getContentField() {
      return contentField;
   }

   /**
    * Sets the name of the field the document xml will be inserted into.
    *
    * @param contentField the name of the field the document xml will be inserted into.
    */
   public void setContentField(String contentField) {
      this.contentField = contentField;
   }

   @Override
   public Iterator<Document> iterator() {
      if (indexOnlyNew && !dumpHandler.isNewDump()) {
         log.info("Current wiki dump is up to date. Skipping produce. (Set indexOnlyNew to false to force indexing.)");
         return Iterators.emptyIterator();
      }
      return new WikiDocumentIterator(maxDocs);
   }

   /**
    * Returns whether the producer only produces documents when a new dump has been downloaded.
    *
    * @return <code>true</code> if the producer only produces documents when a new dump is available.
    */
   public boolean isIndexOnlyNew() {
      return indexOnlyNew;
   }

   /**
    * Sets whether the producer should only produce documents when a new dump has been downloaded.
    * If set to <code>true</code> (the default), the producer only produces documents when a new dump is
    * available. Set this to <code>false</code> to produce documents from a previously downloaded dump.
    *
    * @param indexOnlyNew <code>true</code> to produce documents only when a new dump is available.
    */
   public void setIndexOnlyNew(boolean indexOnlyNew) {
      this.indexOnlyNew = indexOnlyNew;
   }

   private class WikiDocumentIterator implements Iterator<Document> {
      private final int maxDocs;
      private int processedDocs = 0;

      private WikiDocumentIterator(int maxDocs) {
         this.maxDocs = maxDocs;
      }

      @Override
      public boolean hasNext() {
         return (maxDocs < 0 || maxDocs > processedDocs) && documentSplitter.hasNext();
      }

      @Override
      public Document next() {
         if (!hasNext()) {
            throw new NoSuchElementException();
         }
         final Document doc = new Document();
         doc.addFieldValue(contentField, documentSplitter.next());
         processedDocs++;
         return doc;
      }

      @Override
      public void remove() {
         throw new UnsupportedOperationException("Remove not supported");
      }
   }
}
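/*
 * Usage sketch (illustrative only; the dumpHandler below is an assumption and must be a
 * WikipediaDumpHandler configured elsewhere, e.g. through the pipeline configuration):
 *
 *    WikipediaDocumentProducer producer = new WikipediaDocumentProducer();
 *    producer.setDumpHandler(dumpHandler);
 *    producer.setIndexOnlyNew(false); // also produce from a previously downloaded dump
 *    producer.init();
 *    try {
 *       for (Document doc : producer) {
 *          // each document carries the raw page XML in the "wikiPage" field
 *       }
 *    } finally {
 *       producer.close();
 *    }
 */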