package edu.umn.cs.recsys.cbf;

import edu.umn.cs.recsys.dao.ItemTagDAO;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.grouplens.lenskit.knn.item.ModelSize;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import javax.inject.Provider;
import java.io.IOException;

/**
 * Provider that builds a {@link LuceneItemItemModel} by writing one Lucene
 * document per item from an {@link ItemTagDAO} into an in-memory index.
 *
 * <p>Every document carries three stored, analyzed fields: {@code movie}
 * (the item ID as a decimal string), {@code title}, and {@code tags} (the
 * item's tags joined with newlines). Term vectors are recorded for
 * {@code title} and {@code tags} only.
 *
 * @author <a href="http://www.grouplens.org">GroupLens Research</a>
 */
public class LuceneModelBuilder implements Provider<LuceneItemItemModel> {
    private static final Logger logger = LoggerFactory.getLogger(LuceneModelBuilder.class);

    /** Lucene compatibility version shared by the analyzer and the writer configuration. */
    private static final Version LUCENE_VERSION = Version.LUCENE_35;

    private final ItemTagDAO dao;
    private final int modelNeighborCount;

    /**
     * Creates a model builder.
     *
     * @param dao    source of item IDs, titles and tags
     * @param nnbrs  the {@link ModelSize} value, passed through unchanged to
     *               the {@link LuceneItemItemModel} constructor
     */
    @Inject
    public LuceneModelBuilder(ItemTagDAO dao, @ModelSize int nnbrs) {
        this.dao = dao;
        this.modelNeighborCount = nnbrs;
    }

    /**
     * Indexes all items into a fresh {@link RAMDirectory} and wraps the result
     * in a model.
     *
     * @return a model backed by the newly written in-memory index
     * @throws RuntimeException if an {@link IOException} occurs while writing
     *                          the index; the I/O error is kept as the cause
     */
    @Override
    public LuceneItemItemModel get() {
        Directory dir = new RAMDirectory();
        try {
            writeMovies(dir);
        } catch (IOException e) {
            throw new RuntimeException("I/O error writing movie model", e);
        }
        return new LuceneItemItemModel(dir, dao, modelNeighborCount);
    }

    /**
     * Writes one document per item ID reported by the DAO into {@code dir},
     * replacing any index already present there ({@code OpenMode.CREATE}).
     *
     * @param dir the directory to write the index into
     * @throws IOException if creating the writer, adding a document, or
     *                     committing the index fails
     */
    private void writeMovies(Directory dir) throws IOException {
        Analyzer analyzer = new EnglishAnalyzer(LUCENE_VERSION);
        try {
            IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter writer = new IndexWriter(dir, config);
            boolean indexed = false;
            try {
                logger.info("Building Lucene movie model");
                for (long movie : dao.getItemIds()) {
                    logger.debug("building model for {}", movie);
                    writer.addDocument(makeMovieDocument(movie));
                }
                indexed = true;
            } finally {
                closeWriter(writer, indexed);
            }
        } finally {
            // The writer does not own the analyzer, so release it explicitly
            // once the writer is done with it.
            analyzer.close();
        }
    }

    /**
     * Finishes the writer: commits and closes it after a successful run, or
     * rolls it back after a failed one.
     *
     * <p>On the failure path an {@link IOException} from the rollback is only
     * logged, so that it cannot replace the exception that caused the failure.
     *
     * @param writer  the writer to finish
     * @param indexed {@code true} if every document was added successfully
     * @throws IOException if committing/closing fails after a successful run
     */
    private void closeWriter(IndexWriter writer, boolean indexed) throws IOException {
        if (indexed) {
            writer.close();
            return;
        }
        try {
            writer.rollback();
        } catch (IOException e) {
            logger.warn("could not roll back Lucene index writer after failed build", e);
        }
    }

    /**
     * Builds the Lucene document for a single item.
     *
     * <p>A missing title or tag collection is indexed as an empty string:
     * Lucene's {@link Field} constructor rejects {@code null} values, and one
     * item with incomplete metadata should not abort the whole model build.
     *
     * @param movieId the ID of the item to describe
     * @return a document with {@code movie}, {@code title} and {@code tags} fields
     */
    private Document makeMovieDocument(long movieId) {
        String title = StringUtils.defaultString(dao.getItemTitle(movieId));
        // StringUtils.join yields null for a null collection, hence the second guard.
        String tags = StringUtils.defaultString(
                StringUtils.join(dao.getItemTags(movieId), "\n"));

        Document doc = new Document();
        // NOTE(review): the ID is passed through the analyzer like the text
        // fields; presumably LuceneItemItemModel looks items up by this term -
        // confirm against that class before switching to NOT_ANALYZED.
        doc.add(new Field("movie", Long.toString(movieId),
                          Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
        doc.add(new Field("title", title,
                          Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        doc.add(new Field("tags", tags,
                          Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        return doc;
    }
}