/*
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package com.xpn.xwiki.plugin.lucene;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.MDC;

import com.xpn.xwiki.XWiki;
import com.xpn.xwiki.XWikiContext;
import com.xpn.xwiki.XWikiException;
import com.xpn.xwiki.doc.XWikiDocument;
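/*
 * Minimal usage sketch (hypothetical caller code, e.g. a script or admin action; how the
 * IndexUpdater instance is obtained is an assumption, shown only for illustration):
 *
 *   IndexRebuilder rebuilder = new IndexRebuilder(indexUpdater, context);
 *   int rc = rebuilder.startRebuildIndex(null, true, false, context);
 *   if (rc == LucenePluginApi.REBUILD_IN_PROGRESS) {
 *       // Another rebuild is still running; try again later
 *   }
 */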
/**
 * <p>
 * Handles rebuilding of the whole Lucene Search Index. This involves the following steps:
 * <ul>
 * <li>empty the existing index</li>
 * <li>retrieve the names of all virtual wikis</li>
 * <li>for each document in each virtual wiki:
 * <ul>
 * <li>index the document</li>
 * <li>get and index all translations of the document</li>
 * <li>get and index all attachments of the document</li>
 * <li>get and index all objects of the document</li>
 * </ul>
 * </li>
 * </ul>
 * The rebuild can be triggered using the {@link LucenePluginApi#rebuildIndex()} method. Once a
 * rebuild request is made, a new thread is created, so the requesting script can continue
 * processing while the rebuilding is done in the background. The actual indexing is done by the
 * IndexUpdater thread; this thread only gathers the data and passes it to the IndexUpdater.
 * </p>
 * <p>
 * In summary, this class:
 * <ul>
 * <li>cleans the Lucene search indexes and re-submits all the contents of all the wikis for
 * indexing</li>
 * <li>without clogging the indexing thread (since 1.2)</li>
 * <li>all in a background thread (since 1.2)</li>
 * <li>making sure that only one rebuild is in progress (since 1.2)</li>
 * </ul>
 * </p>
 *
 * @version $Id: $
 */
public class IndexRebuilder extends AbstractXWikiRunnable
{
    /** Logging helper. */
    private static final Log LOG = LogFactory.getLog(IndexRebuilder.class);

    /** The actual object/thread that indexes data. */
    private IndexUpdater indexUpdater;

    /** The XWiki context. */
    private XWikiContext context;

    /** Amount of time (in milliseconds) to sleep while waiting for the indexing queue to empty. */
    private static final int retryInterval = 30000;

    /** Flag indicating that a rebuild is already in progress. */
    private boolean rebuildInProgress = false;

    /** SQL where-clause used to select the documents to reindex, or {@code null} for all. */
    private String sql = null;

    /** Whether to clear the index before rebuilding. */
    private boolean clearIndex = false;

    /** Only index a page if it is not already present in the index. */
    private boolean refresh = false;

    /** Number of documents still to be checked. */
    private long tocheck = 0;

    /** Documents that had to be refreshed. */
    private List<String> torefresh = new ArrayList<String>();

    public IndexRebuilder(IndexUpdater indexUpdater, XWikiContext context)
    {
        this.indexUpdater = indexUpdater;

        if (indexUpdater.needInitialBuild) {
            this.startRebuildIndex(null, false, false, context);
            LOG.info("Launched initial lucene indexing");
        }
    }

    public synchronized int startRebuildIndex(String sql, boolean clearIndex, boolean refresh,
        XWikiContext context)
    {
        if (this.rebuildInProgress) {
            LOG.warn("Cannot launch rebuild because a build is in progress");

            return LucenePluginApi.REBUILD_IN_PROGRESS;
        } else {
            this.rebuildInProgress = true;
            this.context = context;
            this.sql = sql;
            this.clearIndex = clearIndex;
            this.refresh = refresh;

            Thread indexRebuilderThread = new Thread(this, "Lucene Index Rebuilder");
            // The JVM should be allowed to shutdown while this thread is running
            indexRebuilderThread.setDaemon(true);
            // Client requests are more important than indexing
            indexRebuilderThread.setPriority(3);
            // Finally, start the rebuild in the background
            indexRebuilderThread.start();

            // Too bad that now we can't tell how many items are there to be indexed...
            return 0;
        }
    }
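    // Hedged example of the "sql" parameter above: indexWiki() forwards it verbatim to
    // XWikiStoreInterface#searchDocumentsNames(), so it should be a where-clause fragment that
    // the configured store accepts. Assuming the usual HQL mapping where "doc" denotes the
    // document, something like the following would limit the rebuild to a single space (the
    // exact clause syntax depends on the store implementation):
    //
    //   rebuilder.startRebuildIndex(" where doc.web='Main'", false, true, context);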
    public void run()
    {
        MDC.put("url", "Lucene index rebuilder thread");

        LOG.debug("Starting lucene index rebuild");
        XWikiContext context = null;
        try {
            // The context must be cloned, as otherwise setDatabase() might affect the response
            // to the current request.
            // TODO This is not a good way to do this; ideally there would be a method that
            // creates a new context and copies only a few needed objects, as some objects are
            // not supposed to be used in 2 different contexts.
            // TODO This seems to work on a simple run:
            // context = new XWikiContext();
            // context.setWiki(this.context.getWiki());
            // context.setEngineContext(this.context.getEngineContext());
            // context.setMode(this.context.getMode());
            // context.setAction(this.context.getAction());
            // context.put("msg", this.context.get("msg"));
            // context.setMainXWiki(this.context.getMainXWiki());
            // context.setURLFactory(this.context.getURLFactory());
            // context.setLanguage(this.context.getLanguage());
            // context.setDatabase(this.context.getDatabase());
            // context.put("org.xwiki.component.manager.ComponentManager", this.context
            //     .get("org.xwiki.component.manager.ComponentManager"));
            context = (XWikiContext) this.context.clone();
            this.context = null;
            // For example, we definitely don't want to use the same hibernate session...
            context.remove("hibsession");
            context.remove("hibtransaction");
            // This is also causing serious problems, as the same xcontext gets shared between
            // threads and causes the hibernate session to be shared in the end. The vcontext is
            // automatically recreated by the velocity renderer, if it isn't found in the
            // xcontext.
            context.remove("vcontext");

            // Since this is where a new thread is created, this is where we need to initialize
            // the Container ThreadLocal variables, and not in the init() method. Otherwise we
            // would simply overwrite the Container values for the main thread...
            initXWikiContainer(context);

            // The original request and response should not be used outside the actual request
            // processing thread, as they will be cleaned later by the container.
            context.setRequest(null);
            context.setResponse(null);

            rebuildIndex(context);
        } catch (Exception e) {
            LOG.error("Error in lucene rebuild thread", e);
        } finally {
            this.rebuildInProgress = false;
            // Cleanup the Container component (it has ThreadLocal variables)
            cleanupXWikiContainer(context);
            if (context != null) {
                context.getWiki().getStore().cleanUp(context);
            }
            MDC.remove("url");
        }

        LOG.debug("Lucene index rebuild done");
    }

    /**
     * First empties the index (if requested), then fetches all documents, their translations and
     * their attachments for re-addition to the index.
     *
     * @param context the XWiki context used to access the wikis
     * @return the total number of documents and attachments successfully added to the indexer
     *         queue; wikis where an error occurred contribute nothing to the total
     */
    private int rebuildIndex(XWikiContext context)
    {
        // Only clear the index if explicitly requested
        if (this.clearIndex) {
            this.indexUpdater.cleanIndex();
        }

        int retval = 0;
        Collection<String> wikiServers;
        XWiki xwiki = context.getWiki();

        if (xwiki.isVirtualMode()) {
            wikiServers = findWikiServers(context);

            if (LOG.isDebugEnabled()) {
                LOG.debug("found " + wikiServers.size() + " virtual wikis:");
                for (String wikiName : wikiServers) {
                    LOG.debug(wikiName);
                }
            }
        } else {
            // No virtual wiki configuration, just index the wiki the context belongs to
            wikiServers = new ArrayList<String>();
            wikiServers.add(context.getDatabase());
        }

        // Iterate over all found virtual wikis
        for (String wikiName : wikiServers) {
            int wikiResult = indexWiki(wikiName, context);
            if (wikiResult > 0) {
                retval += wikiResult;
            }
        }

        return retval;
    }

    /**
     * @return the number of documents that still need to be checked by the current rebuild
     */
    public long getPreIndexQueueSize()
    {
        return this.tocheck;
    }

    /**
     * @return the names ("wiki:Space.Doc") of the documents re-queued in refresh mode
     */
    public List<String> getRefreshedDocuments()
    {
        return this.torefresh;
    }
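    // Hedged monitoring sketch: the two getters above can be polled from another thread (for
    // instance a hypothetical admin page) while a rebuild is running:
    //
    //   long remaining = rebuilder.getPreIndexQueueSize();          // documents left to examine
    //   List<String> refreshed = rebuilder.getRefreshedDocuments(); // "wiki:Space.Doc" names
    //                                                               // re-queued in refresh mode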
    /**
     * Adds the content of a given wiki to the indexUpdater's queue.
     *
     * @param wikiName name of the wiki to index
     * @param context the XWiki context
     * @return the number of index entries added to the queue, or a negative number on errors
     */
    protected int indexWiki(String wikiName, XWikiContext context)
    {
        LOG.info("Reading content of wiki " + wikiName);

        // Number of index entries processed
        int retval = 0;
        XWiki xwiki = context.getWiki();
        String database = context.getDatabase();

        if (this.refresh) {
            this.torefresh.clear();
        }

        try {
            context.setDatabase(wikiName);

            Collection<String> docNames = null;
            try {
                docNames = xwiki.getStore().searchDocumentsNames(
                    (this.sql != null) ? this.sql : "", context);
            } catch (XWikiException ex) {
                LOG.warn(String.format(
                    "Error getting document names for wiki [%s]. Internal error is: %s",
                    wikiName, ex.getMessage()));

                return -1;
            }

            if (docNames == null) {
                this.tocheck = 0;

                return 0;
            }

            // Number of documents that still need to be checked
            this.tocheck = docNames.size();

            for (String docName : docNames) {
                this.tocheck--;

                if (this.refresh) {
                    // Check if the page is already present in the index
                    if (this.indexUpdater.isIndexed(wikiName, docName)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("bypassing document " + wikiName + ":" + docName);
                        }

                        continue;
                    } else {
                        this.torefresh.add(wikiName + ":" + docName);
                    }
                }

                if (LOG.isDebugEnabled()) {
                    LOG.debug("indexing document " + wikiName + ":" + docName);
                }

                XWikiDocument document;
                try {
                    document = xwiki.getDocument(docName, context);
                } catch (XWikiException e2) {
                    LOG.error("error fetching document " + wikiName + ":" + docName, e2);

                    continue;
                }

                if (document != null) {
                    // In order not to load the whole database in memory, we're limiting the
                    // number of documents that are in the processing queue at a moment. We could
                    // use a Bounded Queue in the index updater, but that would generate
                    // exceptions in the rest of the platform, as the index rebuilder could fill
                    // the queue, and then a user trying to save a document would cause an
                    // exception. Thus, it is better to limit the index rebuilder thread only,
                    // and not the index updater.
                    while (this.indexUpdater.getQueueSize() > this.indexUpdater.maxQueueSize) {
                        try {
                            // Don't leave any database connections open while sleeping.
                            // This shouldn't be needed, but we never know what bugs might be
                            // there.
                            context.getWiki().getStore().cleanUp(context);
                            Thread.sleep(retryInterval);
                        } catch (InterruptedException e) {
                            return -2;
                        }
                    }

                    this.indexUpdater.add(document, context);
                    retval++;
                    retval += addTranslationsOfDocument(document, context);
                    retval += this.indexUpdater.addAttachmentsOfDocument(document, context);
                    retval += addObjectsOfDocument(document, context);
                } else if (LOG.isInfoEnabled()) {
                    LOG.info("XWiki delivered null for document name " + wikiName + ":"
                        + docName);
                }
            }
        } finally {
            context.setDatabase(database);
        }

        return retval;
    }

    /**
     * Queues the objects of a document for indexing, so that the content (the values of the
     * title/category/content/extract properties) of XWiki.ArticleClass objects gets indexed.
     */
    private int addObjectsOfDocument(XWikiDocument document, XWikiContext wikiContext)
    {
        int retval = 0;

        if (document.hasElement(XWikiDocument.HAS_OBJECTS)) {
            retval += document.getxWikiObjects().size();
            this.indexUpdater.addObject(document, wikiContext);
        }

        return retval;
    }

    protected int addTranslationsOfDocument(XWikiDocument document, XWikiContext wikiContext)
    {
        int retval = 0;

        List<String> translations;
        try {
            translations = document.getTranslationList(wikiContext);
        } catch (XWikiException e) {
            LOG.error("error getting list of translations from document "
                + document.getFullName(), e);

            return 0;
        }

        for (String lang : translations) {
            try {
                this.indexUpdater.add(document.getTranslatedDocument(lang, wikiContext),
                    wikiContext);
                retval++;
            } catch (XWikiException e1) {
                LOG.error("Error getting translated document for document "
                    + document.getFullName() + " and language " + lang, e1);
            }
        }

        return retval;
    }

    private Collection<String> findWikiServers(XWikiContext context)
    {
        List<String> retval = Collections.emptyList();

        try {
            // Copy the returned list, since it might be unmodifiable and the main wiki may need
            // to be appended below
            retval =
                new ArrayList<String>(context.getWiki().getVirtualWikisDatabaseNames(context));

            if (!retval.contains(context.getMainXWiki())) {
                retval.add(context.getMainXWiki());
            }
        } catch (Exception e) {
            LOG.error("Error getting list of wiki servers!", e);
        }

        return retval;
    }
}