IndexBuilder.java example

Explorer

EPF-Composer-master
- 1.5
  - plugins
  - tests
    - org.eclipse.epf.common.tests
      - src
        org
        eclipse
        epf
        common
        tests
        CommonTests.java
        EPFVersionTest.java
        EPFVersionsTest.java
        StrUtilTest.java
        VersionUtilTest.java
        XMLUtilTest.java
    - org.eclipse.epf.diagram.tests
      - src
        org
        eclipse
        epf
        diagram
        tests
        DiagramTestCase.java
        DiagramTestPlugin.java
        DiagramTests.java
        DiagramUIServiceTest.java
    - org.eclipse.epf.library.tests
      - src
        org
        eclipse
        epf
        library
        tester
        LibraryDiffAnalyzor.java
        LibraryJunitTestService.java
        LibraryTestService.java
        LibraryTesterFactory.java
        OutputDiffAnalyzor.java
        QaTestService.java
        TestCommandFileTest.java
        TestCommandFileTests.java
        TestFolderSetup.java
        TesterOutputUtil.java
        iface
        ITestFolderSetup.java
        LibraryJunitTest.java
        LibraryTester.java
        TCExeReply.java
        TCExeReplyList.java
        TestCommand.java
        TestTracer.java
        impl
        ExportImportTestImpl.java
        LibraryJunitTestImpl.java
        LibraryTesterImpl.java
        TestCommandImpl.java
        TestCommandMgr.java
        testcommands
        TCCircularDependencyCheck.java
        TCCompareToGoldenFile.java
        TCCompareToLibrary.java
        TCCopyLibrary.java
        TCEditMethodElement.java
        TCEditMethodElementBase.java
        TCExeReplyImpl.java
        TCExeReplyListImpl.java
        TCExportConfiguration.java
        TCExportPlugins.java
        TCExportXml.java
        TCImportConfiguration.java
        TCImportPlugins.java
        TCImportXml.java
        TCNewMethodConfiguration.java
        TCNewMethodElement.java
        TCNewMethodPlugin.java
        TCOpenLibrary.java
        TCOutputMethodElement.java
        tests
        AbstractLibraryTestCase.java
        DependencyCheckerTest.java
        LibraryServiceTest.java
        LibraryTestHelper.java
        LibraryTests.java
        TestsPlugin.java
        exportimport
        ConfigExportImport.java
        ConfigExportImportTest0001.java
        ExportImport.java
        ExportImportTestMethodBase.java
        PluginExportImport.java
        PluginExportImportTest0001.java
        StandAloneTest.java
        StandAloneThreadTest.java
        XmlExportImport.java
        XmlExportImportTest0001.java
        validation
        CircularDependencyCheck.java
        ValidationTestImpl.java
        variability
        ActivityVariabilityTest.java
        AttributeFeatureTest.java
        CopyrightTest.java
        FulfillmentTest.java
        Incoming01FeatureTest.java
        Incoming0nFeatureTest.java
        Outgoing01FeatureTest.java
        Outgoing0nFeatureTest.java
        RegressionTest.java
        VariablityBaseTestCase.java
    - org.eclipse.epf.richtext.tests
      - src
        org
        eclipse
        epf
        richtext
        tests
        RichTextTest.java
        RichTextTests.java
        RichTextTestsPlugin.java
        actions
        BlockTagContribution.java
        BoldAction.java
        FontNameContribution.java
        FontSizeContribution.java
        ItalicAction.java
        RichTextAction.java
        SubscriptAction.java
        UnderlineAction.java
        editors
        HTMLEditor.java
        HTMLEditorActionBarContributor.java
        MultiPageHTMLEditor.java
        RichTextTestEditor.java
        views
        RichTextTestView.java
    - org.eclipse.epf.tests
      - src
        org
        eclipse
        epf
        tests
        AllTests.java
        ui
        UserInteractionHandlerTest.java
    - org.eclipse.epf.toolbox
      - src
        org
        eclipse
        epf
        toolbox
        ToolboxPlugin.java
        actions
        ConvertToConfigFree.java
        DiagramDoctor.java
        RemoveLocalDescriptors.java
        batch
        C.java
        EbcBatchCommandMgr.java
        EbcBatchRunner.java
        EbcExeReplies.java
        EbcExeReply.java
        EbcLibraryService.java
        EbcReportMgr.java
        commands
        EbcExportConfiguration.java
        EbcExportPlugins.java
        EbcExportXml.java
        EbcImportConfiguration.java
        EbcImportPlugins.java
        EbcImportXml.java
        EbcOpenLibrary.java
        EbcReportMethodElement.java
        EpfBatchCommand.java
        EpfBatchCommandImpl.java
        libutil
        LibUtil.java
        utils
        CopyPIIFiles.java
        DebugTest.java
        EditFiles.java
        RenameFiles.java
        XsltTransform.java

//------------------------------------------------------------------------------
// Copyright (c) 2005, 2007 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.search;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Properties;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.epf.search.analysis.TextAnalyzer;
import org.eclipse.epf.search.utils.JarCreator;
import org.eclipse.epf.search.utils.LHTMLParser;
import org.eclipse.epf.search.utils.UNCUtil;

/**
 * This class is the main class that creates the Index from the file
 * associations in the process layout.
 */
public class IndexBuilder {

	/** Name of the version file that createIndex() places in the index folder. */
	static final String VERSION_FILE_NAME = "version.txt"; //$NON-NLS-1$
	/** Separator written between the values on the first line of the version file. */
	static final String VERSION_DELIMITER = "*"; //$NON-NLS-1$

	/**
	 * Names of the Lucene document fields. This class itself populates the
	 * contents, url, title, summary, role and uma.type fields, plus one field
	 * per HTML meta tag; the remaining names are presumably used by other
	 * search classes (not visible from this file).
	 */
	public static final String BRIEF_DESCRIPTION_FIELD = "briefDescription"; //$NON-NLS-1$
	public static final String CONTENT_FIELD = "contents"; //$NON-NLS-1$
	public static final String ID_FIELD = "id"; //$NON-NLS-1$
	public static final String MODIFIED_FIELD = "modified"; //$NON-NLS-1$
	public static final String NAME_FIELD = "name"; //$NON-NLS-1$
	public static final String ROLE_FIELD = "role"; //$NON-NLS-1$
	public static final String SUMMARY_FIELD = "summary"; //$NON-NLS-1$
	public static final String TYPE_FIELD = "type"; //$NON-NLS-1$
	public static final String URL_FIELD = "url"; //$NON-NLS-1$
	public static final String TITLE_FIELD = "title"; //$NON-NLS-1$
	public static final String UMA_ELEMENT_TYPE_FIELD = "uma.type"; //$NON-NLS-1$
	// Value stored in the uma.type field when a page declares no uma.type
	// meta tag (this is a field value, not a field name).
	public static final String GENERAL_CONTENT = "general_content"; //$NON-NLS-1$

	// List of UMA element types that are EXCLUDED from the search index: a page
	// whose "uma.type" meta tag equals one of these values is not indexed.
	private static List NO_SEARCHEABLE_UMA_ELEMENTS = new ArrayList();
	static {
		NO_SEARCHEABLE_UMA_ELEMENTS.add("summary"); //$NON-NLS-1$
		NO_SEARCHEABLE_UMA_ELEMENTS.add("workproductdescriptor"); //$NON-NLS-1$
		NO_SEARCHEABLE_UMA_ELEMENTS.add("taskdescriptor"); //$NON-NLS-1$
		NO_SEARCHEABLE_UMA_ELEMENTS.add("roledescriptor"); //$NON-NLS-1$
	}

	// Paths of the top level directories that should be excluded from the
	// search index. Populated by the constructor and compared against
	// File.getAbsolutePath() while walking the site. Static, hence shared by
	// all IndexBuilder instances.
	public static List dirsToSkip = new ArrayList();
	// Root directory of the published site; assigned by the constructor.
	// Static, so the most recently constructed builder determines its value.
	public static String pDirectory = null;
	// Path of the "search/index" folder below the publish directory; stays
	// null when the builder was constructed with a null publish directory.
	private StringBuffer indexFolder = null;
	// Last path segment of the publish directory; used as the URL prefix of
	// indexed pages and as the first value in the version file.
	private String productName = null;
	// File name fragments; a file whose name contains one of them is skipped.
	private List filesToSkip = new ArrayList();
	// The publish directory as a File; used to relativize page paths.
	private File parentFolder = null;
	
	/**
	 * Creates an index builder for a published site.
	 * <p>
	 * If the given path contains a <code>applet</code> path segment prefix
	 * (separator followed by "applet"), everything from that segment onwards
	 * is cut off and the remainder is used as the publish directory. The
	 * product name is the last path segment of the publish directory and the
	 * index is placed in <code>search/index</code> below it.
	 * <p>
	 * Note that the publish directory and the list of skipped directories are
	 * static and therefore shared by all instances.
	 * 
	 * @param publishDir
	 *            The directory of the published site. If <code>null</code>,
	 *            the builder is left unconfigured.
	 */
	public IndexBuilder(String publishDir) {
		if (publishDir == null) {
			return;
		}

		int appletIndex = publishDir.indexOf(File.separator + "applet"); //$NON-NLS-1$

		// Keep the trailing separator when cutting off the applet folder.
		pDirectory = UNCUtil.convertFilename((appletIndex > -1) ? publishDir
				.substring(0, appletIndex + 1) : publishDir);
		parentFolder = new File(pDirectory);

		// The product name is the last segment of the publish directory,
		// ignoring a trailing slash.
		String siteName = pDirectory.replace(File.separatorChar, '/');
		int end = siteName.length();
		if (siteName.endsWith("/")) { //$NON-NLS-1$
			end = end - 1;
		}
		int lastSlash = siteName.lastIndexOf("/", end - 1); //$NON-NLS-1$
		productName = siteName.substring(lastSlash + 1, end);

		// The index lives in <publish dir>/search/index.
		StringBuffer searchFolder = new StringBuffer(pDirectory);
		if (!searchFolder.toString().endsWith(File.separator)) {
			searchFolder.append(File.separator);
		}
		searchFolder.append("search"); //$NON-NLS-1$

		indexFolder = new StringBuffer(searchFolder.toString());
		indexFolder.append(File.separator).append("index"); //$NON-NLS-1$

		String[] skippedDirNames = { 
				"applet", //$NON-NLS-1$
				"css", //$NON-NLS-1$
				"ext_help", //$NON-NLS-1$
				"icons", //$NON-NLS-1$
				"images", //$NON-NLS-1$
				"index", //$NON-NLS-1$
				"logs", //$NON-NLS-1$
				"manuals", //$NON-NLS-1$
				"noapplet", //$NON-NLS-1$
				"pages_not_installed", //$NON-NLS-1$
				"process", //$NON-NLS-1$
				"scripts", //$NON-NLS-1$
				"stylesheets", //$NON-NLS-1$
				"xml", //$NON-NLS-1$
				"search" //$NON-NLS-1$
		};
		for (int i = 0; i < skippedDirNames.length; i++) {
			// Build the path exactly the way the directory walk does
			// (child of the publish directory, made absolute). Plain string
			// concatenation produced paths that never matched when the
			// publish directory had no trailing separator or was relative.
			String skipPath = new File(parentFolder, skippedDirNames[i])
					.getAbsolutePath();
			// The list is static; do not pile up duplicates when several
			// builders are created for the same site.
			if (!dirsToSkip.contains(skipPath)) {
				dirsToSkip.add(skipPath);
			}
		}

		filesToSkip.add("_desc.htm"); //$NON-NLS-1$
		filesToSkip.add("_wbs.htm"); //$NON-NLS-1$
		filesToSkip.add("_tbs.htm"); //$NON-NLS-1$
		filesToSkip.add("_wpbs.htm"); //$NON-NLS-1$
	}
	
	/**
	 * Builds the search index for the published site and writes the version
	 * file describing it.
	 * <p>
	 * The index is (re)created in the index folder. A CJK analyzer is used
	 * when the default locale's language is Japanese or Korean, the regular
	 * text analyzer otherwise. Errors raised while indexing are only printed
	 * to the console; the version file is written regardless.
	 * <p>
	 * The first line of the version file contains the product name and the
	 * creation time in milliseconds, separated by the version delimiter; when
	 * the index is jarred the size of the jar is appended as a third value. A
	 * second line "CJKAnalyzer" is written when the CJK analyzer was used.
	 * 
	 * @param jarIt
	 *            If <code>true</code>, the index folder is packed into a jar
	 *            and all other files in the index folder are deleted.
	 * @return <code>true</code> whenever the method returns normally.
	 * @throws SearchServiceException
	 *             if the version file cannot be written or the index jar does
	 *             not exist after jarring.
	 * @throws IllegalStateException
	 *             if the builder has no index folder or publish directory,
	 *             e.g. because it was created with a <code>null</code>
	 *             publish directory.
	 */
	public boolean createIndex(boolean jarIt) throws SearchServiceException {
		synchronized (IndexBuilder.class) {

			if (indexFolder == null || pDirectory == null) {
				throw new IllegalStateException(
						"Invalid indexFolder or pDirectory"); //$NON-NLS-1$
			}

			boolean jako = false;
			String lang = Locale.getDefault().getLanguage();
			if (lang.equals(Locale.JAPANESE.getLanguage())
					|| lang.equals(Locale.KOREA.getLanguage())) {
				jako = true;
			}
			Analyzer analyzer = jako ? new CJKAnalyzer() : new TextAnalyzer();

			IndexWriter fsWriter = null;
			try {
				fsWriter = new IndexWriter(FSDirectory.getDirectory(
						indexFolder.toString(), true), analyzer, true);
				fsWriter.setMaxFieldLength(1000000);

				indexDocs(new File(pDirectory), fsWriter);

				fsWriter.optimize();
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				// Always close the writer, also after a failure, so that its
				// files and lock are released.
				if (fsWriter != null) {
					try {
						fsWriter.close();
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			}

			// create the version file.
			long milliseconds = new Date().getTime();
			boolean cjk = analyzer instanceof CJKAnalyzer;

			if (!jarIt) {
				try {
					writeVersionFile(productName + VERSION_DELIMITER
							+ milliseconds, cjk);
				} catch (IOException ioe) {
					throw new SearchServiceException(
							SearchResources.createSearchIndexError);
				}

				return true;
			}

			// jar up the created index.
			JarCreator.jarFolder(indexFolder.toString());

			System.out.println("index Jarred successfully"); //$NON-NLS-1$

			// delete the files now that they've been jarred.
			File indexDir = new File(indexFolder.toString());
			File[] files = indexDir.listFiles();
			// listFiles() returns null when the folder cannot be read.
			if (files != null) {
				for (int i = 0; i < files.length; i++) {
					File tempFile = files[i];
					if (!tempFile.getName().equals(JarCreator.INDEX_JAR)) {
						tempFile.delete();
					}
				}
			}

			File newIndexJar = new File(indexFolder + File.separator
					+ JarCreator.INDEX_JAR);
			if (!newIndexJar.exists()) {
				throw new SearchServiceException(
						SearchResources.createSearchIndexError);
			}

			try {
				String fileSize = "" + newIndexJar.length(); //$NON-NLS-1$
				writeVersionFile(productName + VERSION_DELIMITER + milliseconds
						+ VERSION_DELIMITER + fileSize, cjk);
			} catch (IOException ioe) {
				throw new SearchServiceException(
						SearchResources.createSearchIndexError);
			}

			return true;
		}
	}

	/**
	 * Writes the version file into the index folder, replacing an existing
	 * one. The writer is closed even when writing fails.
	 * 
	 * @param versionLine
	 *            The content of the first line, without line terminator.
	 * @param cjk
	 *            If <code>true</code>, a second line "CJKAnalyzer" is
	 *            written.
	 * @throws IOException
	 *             if the file cannot be created or written.
	 */
	private void writeVersionFile(String versionLine, boolean cjk)
			throws IOException {
		BufferedWriter bw = null;
		try {
			bw = new BufferedWriter(new FileWriter(indexFolder
					+ File.separator + VERSION_FILE_NAME));
			bw.write(versionLine + "\n"); //$NON-NLS-1$
			if (cjk) {
				bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$	//$NON-NLS-2$
			}
		} finally {
			if (bw != null) {
				bw.close();
			}
		}
	}

	/**
	 * Adds the given file to the index or, if it is a directory, recursively
	 * all files below it.
	 * <p>
	 * Nothing is indexed for a file or directory whose absolute path is
	 * listed in <code>dirsToSkip</code>, for a file whose name contains one
	 * of the <code>filesToSkip</code> fragments, for non HTML files and for
	 * HTML files rejected by <code>shouldBeExcluded</code>. A failure to
	 * index a single page is printed to the console and does not stop the
	 * traversal.
	 * 
	 * @param file
	 *            The file or directory to index.
	 * @param writer
	 *            The index writer receiving the documents.
	 * @throws Exception
	 *             declared for compatibility; failures of individual pages
	 *             are handled here.
	 */
	private void indexDocs(File file, IndexWriter writer) throws Exception {
		if (dirsToSkip.contains(file.getAbsolutePath())) {
			return;
		}

		if (file.isDirectory()) {
			String[] children = file.list();
			if (children == null) {
				// list() returns null when the directory cannot be read.
				// Skip it instead of failing the whole index run with a
				// NullPointerException.
				System.out.println("indexDocs: cannot list directory " //$NON-NLS-1$
						+ file.getAbsolutePath());
				return;
			}
			for (int i = 0; i < children.length; i++) {
				indexDocs(new File(file, children[i]), writer);
			}
			return;
		}

		if (file.isFile()) {
			String fileName = file.getName();
			for (Iterator iter = filesToSkip.iterator(); iter.hasNext();) {
				String fileToSkip = (String) iter.next();
				if (fileName.indexOf(fileToSkip) > -1) {
					return;
				}
			}
		}

		if (!isHtmlDoc(file) || shouldBeExcluded(file)) {
			return;
		}

		try {
			Document doc = getHTMLDocument(file);
			if (doc != null) {
				writer.addDocument(doc);
			}
		} catch (Exception e1) {
			System.out.println(file.getName());
			System.out.println("indexDocs"); //$NON-NLS-1$
			e1.printStackTrace();
		}
	}

	/**
	 * Tells whether the given file must be left out of the search index.
	 * <p>
	 * A file is left out when the absolute path of its parent directory is a
	 * string prefix of the publish directory; presumably this is meant to
	 * drop the pages located directly in the root of the published site.
	 * 
	 * @param file
	 *            The file to be verified.
	 * @return <code>true</code> if the given file should be excluded from the
	 *         search index, <code>false</code> otherwise.
	 */
	private boolean shouldBeExcluded(File file) {
		File containingDir = file.getParentFile();
		return pDirectory.startsWith(containingDir.getAbsolutePath());
	}

	/**
	 * Tells whether the given file is an HTML page, judged by its path ending
	 * in ".html" or ".htm" (case sensitive).
	 * 
	 * @param file
	 *            The file to be checked.
	 * @return <code>true</code> if the path has one of the two extensions.
	 */
	private static boolean isHtmlDoc(File file) {
		String filePath = file.getPath();
		if (filePath.endsWith(".html")) { //$NON-NLS-1$
			return true;
		}
		return filePath.endsWith(".htm"); //$NON-NLS-1$
	}

	/**
	 * Tells whether a page must be kept out of the index because of its UMA
	 * element type.
	 * 
	 * @param metaTags
	 *            The meta tags of the page.
	 * @return <code>true</code> if the "uma.type" meta tag is present and its
	 *         value is one of the non searchable element types.
	 */
	private boolean isNoSearchableDocument(Properties metaTags) {
		String umaType = metaTags.getProperty(UMA_ELEMENT_TYPE_FIELD);
		if (umaType == null) {
			// A page without uma.type is treated as a general document.
			return false;
		}
		return NO_SEARCHEABLE_UMA_ELEMENTS.contains(umaType);
	}
	
	// Scratch buffer and skipped-file counter. Neither is read or written by
	// any active code in this class. Both are package-visible, so other
	// classes of the package may use them.
	// NOTE(review): confirm there are no external users before removing.
	char[] cbuf = new char[1024];
	int skipCount = 0;
	
	/**
	 * Parses the given HTML file, read as UTF-8, and creates the Lucene
	 * document for it.
	 * <p>
	 * The document contains the page URL (product name followed by the path
	 * relative to the publish directory), the text content, the title, the
	 * summary and one field per meta tag. The role field defaults to
	 * "NORUPROLE" and the uma.type field to the general content type when
	 * the page does not declare them.
	 * 
	 * @param file
	 *            The HTML file to be parsed.
	 * @return The document, or <code>null</code> if the parser provides no
	 *         reader, the page has a non searchable uma.type, the page has no
	 *         title, or parsing failed (the failure is logged).
	 */
	private Document getHTMLDocument(File file) {
		Document luceneDocument = null;
		InputStreamReader input = null;
		BufferedReader bufferedReader = null;
		try {
			input = new InputStreamReader(new FileInputStream(file), "UTF-8"); //$NON-NLS-1$

			LHTMLParser parser = new LHTMLParser(input);

			Reader reader = parser.getReader();
			if (reader == null) {
				return null;
			}

			// Read the content completely before looking at the meta tags,
			// even for pages that end up being skipped: the LHTMLParser
			// thread will not end if the reader is not processed, causing a
			// major resource leak.
			StringBuffer htmlContent = new StringBuffer();
			bufferedReader = new BufferedReader(reader);
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				htmlContent.append(line).append('\n');
			}

			Properties metaTags = parser.getMetaTags();
			if (isNoSearchableDocument(metaTags)) {
				return null;
			}

			// Pages without a title are not indexed.
			String title = parser.getTitle();
			if (title == null || title.length() == 0) {
				return null;
			}
			// Workaround a Linux specific issue.
			title = title.replaceAll("\\xa0", " "); //$NON-NLS-1$ //$NON-NLS-2$

			Document doc = new Document();

			String url = productName
					+ file.getPath().substring(parentFolder.getPath().length())
							.replace(File.separatorChar, '/');
			doc.add(Field.UnIndexed(URL_FIELD, url));
			doc.add(Field.UnStored(CONTENT_FIELD, htmlContent.toString()));
			doc.add(Field.Keyword(TITLE_FIELD, title));

			// Strip the leading title (and the character following it) when
			// the summary repeats it.
			String summary = parser.getSummary();
			if (summary.startsWith(title) && summary.length() > title.length()) {
				summary = summary.substring(title.length() + 1);
			}
			doc.add(Field.Keyword(SUMMARY_FIELD, summary));

			// Every meta tag, the role tag included, becomes a text field.
			for (Enumeration names = metaTags.propertyNames(); names
					.hasMoreElements();) {
				String tagName = (String) names.nextElement();
				if (tagName == null) {
					continue;
				}
				String tagValue = metaTags.getProperty(tagName);
				if (tagValue != null) {
					doc.add(Field.Text(tagName, tagValue));
				}
			}

			if (doc.getField(ROLE_FIELD) == null) {
				// Default to "NORUPROLE" to support searching for files
				// without role meta tags.
				doc.add(Field.Text(ROLE_FIELD, "NORUPROLE")); //$NON-NLS-1$
			}

			if (doc.getField(UMA_ELEMENT_TYPE_FIELD) == null) {
				// Default to general content.
				doc.add(Field.Text(UMA_ELEMENT_TYPE_FIELD, GENERAL_CONTENT));
			}

			luceneDocument = doc;
		} catch (Exception e) {
			luceneDocument = null;
			SearchPlugin.getDefault().getLogger().logError(e);
		} finally {
			if (bufferedReader != null) {
				try {
					bufferedReader.close();
				} catch (Exception ignored) {
					// Best effort; nothing useful can be done here.
				}
			}
			if (input != null) {
				try {
					input.close();
				} catch (Exception ignored) {
					// Best effort; nothing useful can be done here.
				}
			}
		}

		return luceneDocument;
	}

}