package lia.common; /** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.document.DateTools; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.util.Version; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.Properties; import java.util.Date; import java.util.List; import java.util.ArrayList; import java.text.ParseException; public class CreateTestIndex { public static Document getDocument(String rootDir, File file) throws IOException { Properties props = new Properties(); props.load(new FileInputStream(file)); Document doc = new Document(); // category comes from relative path below the base directory String category = file.getParent().substring(rootDir.length()); //1 category = category.replace(File.separatorChar, '/'); //1 String isbn = props.getProperty("isbn"); //2 String title = props.getProperty("title"); //2 String author = props.getProperty("author"); //2 String url = props.getProperty("url"); //2 String subject = props.getProperty("subject"); //2 String pubmonth = props.getProperty("pubmonth"); //2 System.out.println(title + "\n" + author + "\n" + subject + "\n" + pubmonth + "\n" + category + "\n---------"); doc.add(new Field("isbn", // 3 isbn, // 3 Field.Store.YES, // 3 Field.Index.NOT_ANALYZED)); // 3 doc.add(new Field("category", // 3 category, // 3 Field.Store.YES, // 3 Field.Index.NOT_ANALYZED)); // 3 doc.add(new Field("title", // 3 title, // 3 Field.Store.YES, // 3 Field.Index.ANALYZED, // 3 Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 doc.add(new Field("title2", // 3 title.toLowerCase(), // 3 Field.Store.YES, // 3 Field.Index.NOT_ANALYZED_NO_NORMS, // 3 Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 // split multiple authors into unique field instances String[] authors = author.split(","); // 3 for (String a : authors) { // 3 doc.add(new Field("author", // 3 a, // 3 Field.Store.YES, // 3 Field.Index.NOT_ANALYZED, // 3 Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 } doc.add(new Field("url", // 3 url, // 3 Field.Store.YES, // 3 Field.Index.NOT_ANALYZED_NO_NORMS)); // 3 doc.add(new Field("subject", // 3 //4 subject, // 3 //4 Field.Store.YES, // 3 //4 Field.Index.ANALYZED, // 3 //4 Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 //4 doc.add(new NumericField("pubmonth", // 3 Field.Store.YES, // 3 true).setIntValue(Integer.parseInt(pubmonth))); // 3 Date d; // 3 try { // 3 d = DateTools.stringToDate(pubmonth); // 3 } catch (ParseException pe) { // 3 throw new RuntimeException(pe); // 3 } // 3 doc.add(new NumericField("pubmonthAsDay") // 3 .setIntValue((int) (d.getTime()/(1000*3600*24)))); // 3 for(String text : new String[] {title, subject, author, category}) { // 3 // 5 doc.add(new Field("contents", text, // 3 // 5 Field.Store.NO, Field.Index.ANALYZED, // 3 // 5 Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 // 5 } return doc; } private static String aggregate(String[] strings) { StringBuilder buffer = new StringBuilder(); for (int i = 0; i < strings.length; i++) { buffer.append(strings[i]); buffer.append(" "); } return buffer.toString(); } private static void findFiles(List<File> result, File dir) { for(File file : dir.listFiles()) { if (file.getName().endsWith(".properties")) { result.add(file); } else if (file.isDirectory()) { findFiles(result, file); } } } private static class MyStandardAnalyzer extends StandardAnalyzer { // 6 public MyStandardAnalyzer(Version matchVersion) { // 6 super(matchVersion); // 6 } // 6 public int getPositionIncrementGap(String field) { // 6 if (field.equals("contents")) { // 6 return 100; // 6 } else { // 6 return 0; // 6 } } } public static void main(String[] args) throws IOException { String dataDir = args[0]; String indexDir = args[1]; List<File> results = new ArrayList<File>(); findFiles(results, new File(dataDir)); System.out.println(results.size() + " books to index"); Directory dir = FSDirectory.open(new File(indexDir)); IndexWriter w = new IndexWriter(dir, new MyStandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); for(File file : results) { Document doc = getDocument(dataDir, file); w.addDocument(doc); } w.close(); dir.close(); } } /* #1 Get category #2 Pull fields #3 Add fields to Document instance #4 Flag subject field #5 Add catch-all contents field #6 Custom analyzer to override multi-valued position increment */