//------------------------------------------------------------------------------ // Copyright (c) 2005, 2007 IBM Corporation and others. // All rights reserved. This program and the accompanying materials // are made available under the terms of the Eclipse Public License v1.0 // which accompanies this distribution, and is available at // http://www.eclipse.org/legal/epl-v10.html // // Contributors: // IBM Corporation - initial implementation //------------------------------------------------------------------------------ package org.eclipse.epf.search; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Properties; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.FSDirectory; import org.eclipse.epf.search.analysis.TextAnalyzer; import org.eclipse.epf.search.utils.JarCreator; import org.eclipse.epf.search.utils.LHTMLParser; import org.eclipse.epf.search.utils.UNCUtil; /** * This class is the main class that creates the Index from the file * associations in the process layout. */ public class IndexBuilder { static final String VERSION_FILE_NAME = "version.txt"; //$NON-NLS-1$ static final String VERSION_DELIMITER = "*"; //$NON-NLS-1$ /** * Document fields. */ public static final String BRIEF_DESCRIPTION_FIELD = "briefDescription"; //$NON-NLS-1$ public static final String CONTENT_FIELD = "contents"; //$NON-NLS-1$ public static final String ID_FIELD = "id"; //$NON-NLS-1$ public static final String MODIFIED_FIELD = "modified"; //$NON-NLS-1$ public static final String NAME_FIELD = "name"; //$NON-NLS-1$ public static final String ROLE_FIELD = "role"; //$NON-NLS-1$ public static final String SUMMARY_FIELD = "summary"; //$NON-NLS-1$ public static final String TYPE_FIELD = "type"; //$NON-NLS-1$ public static final String URL_FIELD = "url"; //$NON-NLS-1$ public static final String TITLE_FIELD = "title"; //$NON-NLS-1$ public static final String UMA_ELEMENT_TYPE_FIELD = "uma.type"; //$NON-NLS-1$ public static final String GENERAL_CONTENT = "general_content"; //$NON-NLS-1$ // List of UMA elements that should be included in the search index. private static List NO_SEARCHEABLE_UMA_ELEMENTS = new ArrayList(); static { NO_SEARCHEABLE_UMA_ELEMENTS.add("summary"); //$NON-NLS-1$ NO_SEARCHEABLE_UMA_ELEMENTS.add("workproductdescriptor"); //$NON-NLS-1$ NO_SEARCHEABLE_UMA_ELEMENTS.add("taskdescriptor"); //$NON-NLS-1$ NO_SEARCHEABLE_UMA_ELEMENTS.add("roledescriptor"); //$NON-NLS-1$ } // A list of top level directories that should be excluded from the search // index. public static List dirsToSkip = new ArrayList(); public static String pDirectory = null; private StringBuffer indexFolder = null; private String productName = null; private List filesToSkip = new ArrayList(); private File parentFolder = null; public IndexBuilder(String publishDir) { int appletIndex = -1; if (publishDir == null) return; appletIndex = publishDir.indexOf(File.separator + "applet"); //$NON-NLS-1$ pDirectory = UNCUtil.convertFilename((appletIndex > -1) ? publishDir .substring(0, appletIndex + 1) : publishDir); String siteName = pDirectory.replace(File.separatorChar, '/'); parentFolder = new File(pDirectory); int index = siteName.length(); if (siteName.endsWith("/")) { //$NON-NLS-1$ index = index - 1; } int index2 = siteName.lastIndexOf("/", index - 1); //$NON-NLS-1$ productName = siteName.substring(index2 + 1, index); // create the index StringBuffer searchFolder = new StringBuffer(pDirectory); if (!searchFolder.toString().endsWith(File.separator)) { searchFolder.append(File.separator); } searchFolder.append("search"); //$NON-NLS-1$ indexFolder = new StringBuffer(searchFolder.toString()); indexFolder.append(File.separator).append("index"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "applet"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "css"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "ext_help"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "icons"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "images"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "index"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "logs"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "manuals"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "noapplet"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "pages_not_installed"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "process"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "scripts"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "stylesheets"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "xml"); //$NON-NLS-1$ dirsToSkip.add(pDirectory + "search"); //$NON-NLS-1$ filesToSkip.add("_desc.htm"); //$NON-NLS-1$ filesToSkip.add("_wbs.htm"); //$NON-NLS-1$ filesToSkip.add("_tbs.htm"); //$NON-NLS-1$ filesToSkip.add("_wpbs.htm"); //$NON-NLS-1$ } public boolean createIndex(boolean jarIt) throws SearchServiceException { synchronized (IndexBuilder.class) { if (indexFolder == null || pDirectory == null) { throw new IllegalStateException( "Invalid indexFolder or pDirectory"); //$NON-NLS-1$ } boolean jako = false; Locale locale = Locale.getDefault(); String lang = locale.getLanguage(); if (lang.equals(Locale.JAPANESE.getLanguage()) || lang.equals(Locale.KOREA.getLanguage())) { jako = true; } Analyzer analyzer = jako ? new CJKAnalyzer() : new TextAnalyzer(); try { // RAMDirectory ramDir = new RAMDirectory(); IndexWriter fsWriter = new IndexWriter(FSDirectory .getDirectory(indexFolder.toString(), true), analyzer, true); // IndexWriter ramWriter = new IndexWriter(ramDir, // new TextAnalyzer(), true); if ((fsWriter != null)) { // fsWriter.mergeFactor = 1000; // fsWriter.maxMergeDocs = 10000; fsWriter.setMaxFieldLength(1000000); indexDocs(new File(pDirectory), fsWriter); // fsWriter.addIndexes(new Directory[] { ramDir }); fsWriter.optimize(); // ramWriter.close(); fsWriter.close(); } } catch (Exception e) { e.printStackTrace(); } // create the version file. Date today = new Date(); long milliseconds = today.getTime(); if (!jarIt) { try { FileWriter fw = new FileWriter(indexFolder + File.separator + VERSION_FILE_NAME); BufferedWriter bw = new BufferedWriter(fw); bw.write(productName + VERSION_DELIMITER + milliseconds + "\n"); //$NON-NLS-1$ if (analyzer instanceof CJKAnalyzer) { bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$ //$NON-NLS-2$ } bw.close(); fw.close(); } catch (IOException ioe) { throw new SearchServiceException( SearchResources.createSearchIndexError); } return true; } // jar up the created index. JarCreator.jarFolder(indexFolder.toString()); System.out.println("index Jarred successfully"); //$NON-NLS-1$ try { // delete the files now that they've been jarred. File indexDir = new File(indexFolder.toString()); File[] files = indexDir.listFiles(); for (int i = 0; i < files.length; i++) { File tempFile = files[i]; if (!tempFile.getName().equals(JarCreator.INDEX_JAR)) { tempFile.delete(); } } // String rupName = publishDir.substring(index); File newIndexJar = new File(indexFolder + File.separator + JarCreator.INDEX_JAR); if (newIndexJar.exists()) { String fileSize = "" + newIndexJar.length(); //$NON-NLS-1$ FileWriter fw = new FileWriter(indexFolder + File.separator + VERSION_FILE_NAME); BufferedWriter bw = new BufferedWriter(fw); bw.write(productName + VERSION_DELIMITER + milliseconds + VERSION_DELIMITER + fileSize + "\n"); //$NON-NLS-1$ if (analyzer instanceof CJKAnalyzer) { bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$ //$NON-NLS-2$ } bw.close(); fw.close(); } else { throw new SearchServiceException( SearchResources.createSearchIndexError); } } catch (IOException ioe) { throw new SearchServiceException( SearchResources.createSearchIndexError); } return true; } } /** * Index the actual documents specified by the files and recursively get all * file in the specified folder file * */ private void indexDocs(File file, IndexWriter writer) throws Exception { if (dirsToSkip.contains(file.getAbsolutePath())) { return; } if (file.isFile()) { for (Iterator iter = filesToSkip.iterator(); iter.hasNext();) { String fileToSkip = (String) iter.next(); if (file.getName().indexOf(fileToSkip) > -1) { return; } } } if (file.isDirectory()) { String[] files = file.list(); for (int i = 0; i < files.length; i++) { indexDocs(new File(file, files[i]), writer); } } else if (isHtmlDoc(file)) { if (shouldBeExcluded(file)) { return; } try { Document doc = getHTMLDocument(file); if (doc != null) { writer.addDocument(doc); } } catch (Exception e1) { System.out.println(file.getName()); System.out.println("indexDocs"); //$NON-NLS-1$ e1.printStackTrace(); } } } /** * Checks whether the given file should be excluded from the search index. * * @param file * The file to be verified. * @return <code>true</code> if the given file should be excluded from the * search index. */ private boolean shouldBeExcluded(File file) { String path = file.getParentFile().getAbsolutePath(); if (pDirectory.startsWith(path)) { return true; } return false; } private static boolean isHtmlDoc(File file) { String path = file.getPath(); return path.endsWith(".html") || path.endsWith(".htm"); //$NON-NLS-1$ //$NON-NLS-2$ } private boolean isNoSearchableDocument(Properties metaTags) { String value = metaTags.getProperty(UMA_ELEMENT_TYPE_FIELD); // value == null is treated as general document return (value != null) && NO_SEARCHEABLE_UMA_ELEMENTS.contains(value); } char[] cbuf = new char[1024]; int skipCount = 0; private Document getHTMLDocument(File file) { Document luceneDocument = null; InputStreamReader input = null; Reader reader = null; BufferedReader bufferedReader = null; try { input = new InputStreamReader(new FileInputStream(file), "UTF-8"); //$NON-NLS-1$ LHTMLParser parser = new LHTMLParser(input); reader = parser.getReader(); if ( reader == null ) { return null; } StringBuffer htmlContent = new StringBuffer(""); String line = ""; bufferedReader = new BufferedReader(reader); while((line = bufferedReader.readLine()) != null) { htmlContent.append(line + "\n"); } Properties metaTags = parser.getMetaTags(); if ( isNoSearchableDocument(metaTags) ) { // the LHTMLParser thread will not end if the reader is not processed // causing major resource leak // while ( reader.read(cbuf) > 0 ) { // ; // } //System.out.println( ++skipCount + " file skipped: " + file.getAbsolutePath()); parser = null; return null; } luceneDocument = new Document(); String url = productName + file.getPath().substring(parentFolder.getPath().length()) .replace(File.separatorChar, '/'); //$NON-NLS-1$ luceneDocument.add(Field.UnIndexed(URL_FIELD, url)); // luceneDocument.add(Field.Text(CONTENT_FIELD, reader)); luceneDocument.add(Field.UnStored(CONTENT_FIELD, htmlContent.toString())); String title = parser.getTitle(); if (title != null && title.length() > 0) { // Workaround a Linux specific issue. title = title.replaceAll("\\xa0", " "); //$NON-NLS-1$ //$NON-NLS-2$ luceneDocument.add(Field.Keyword(TITLE_FIELD, title)); } else { return null; } String summary = parser.getSummary(); if (summary.startsWith(title) && summary.length() > title.length()) { luceneDocument.add(Field.Keyword(SUMMARY_FIELD, summary .substring(title.length() + 1))); } else luceneDocument.add(Field.Keyword(SUMMARY_FIELD, parser .getSummary())); for (Enumeration names = metaTags.propertyNames(); names .hasMoreElements();) { String tagName = (String) names.nextElement(); if (tagName != null) { if (tagName.equals(ROLE_FIELD)) { String roleName = metaTags.getProperty(tagName); if (roleName != null) { luceneDocument.add(Field.Text(tagName, roleName)); } } else { String tagValue = metaTags.getProperty(tagName); if (tagValue != null) { luceneDocument.add(Field.Text(tagName, tagValue)); } } } } if (luceneDocument.getField(ROLE_FIELD) == null) { // Default to "na" to support searching for files without // role meta tags. luceneDocument.add(Field.Text(ROLE_FIELD, "NORUPROLE")); //$NON-NLS-1$ } Field umaTypeField = luceneDocument .getField(UMA_ELEMENT_TYPE_FIELD); if (umaTypeField == null) { // Default to general content. luceneDocument.add(Field.Text(UMA_ELEMENT_TYPE_FIELD, GENERAL_CONTENT)); } parser = null; } catch (Exception e) { luceneDocument = null; SearchPlugin.getDefault().getLogger().logError(e); } finally { if (bufferedReader != null) { try { bufferedReader.close(); } catch (Exception e) { } } if (input != null) { try { input.close(); } catch (Exception e) { } } } return luceneDocument; } }