// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.semantic.broadcast; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.index.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; import org.talend.dataquality.semantic.index.DictionarySearcher; class BroadcastUtils { /** * Forbid instantiation */ private BroadcastUtils() { } /** * initialize a list of serializable BroadcastDocumentObject from existing lucene Directory */ static List<BroadcastDocumentObject> readDocumentsFromIndex(Directory indexDir) throws IOException { List<BroadcastDocumentObject> dictionaryObject = new ArrayList<>(); DirectoryReader reader = DirectoryReader.open(indexDir); Bits liveDocs = MultiFields.getLiveDocs(reader); for (int i = 0; i < reader.maxDoc(); i++) { if (liveDocs != null && !liveDocs.get(i)) { continue; } Document doc = reader.document(i); String catId = doc.getField(DictionarySearcher.F_CATID).stringValue(); Set<String> valueSet = new HashSet<String>(); // original values must be read from the F_RAW field for (IndexableField syntermField : doc.getFields(DictionarySearcher.F_RAW)) { valueSet.add(syntermField.stringValue()); } dictionaryObject.add(new BroadcastDocumentObject(catId, valueSet)); } return dictionaryObject; } /** * initialize a list of serializable BroadcastDocumentObject from existing lucene Directory */ static List<BroadcastDocumentObject> readDocumentsFromIndex(Directory indexDir, Set<String> catNames) throws IOException { List<BroadcastDocumentObject> dictionaryObject = new ArrayList<>(); DirectoryReader reader = DirectoryReader.open(indexDir); Bits liveDocs = MultiFields.getLiveDocs(reader); for (int i = 0; i < reader.maxDoc(); i++) { if (liveDocs != null && !liveDocs.get(i)) { continue; } Document doc = reader.document(i); String category = doc.getField(DictionarySearcher.F_WORD).stringValue(); if (catNames.contains(category)) { String catId = doc.getField(DictionarySearcher.F_CATID).stringValue(); Set<String> valueSet = new HashSet<String>(); // original values must be read from the F_RAW field for (IndexableField syntermField : doc.getFields(DictionarySearcher.F_RAW)) { valueSet.add(syntermField.stringValue()); } dictionaryObject.add(new BroadcastDocumentObject(catId, valueSet)); } } return dictionaryObject; } /** * create a lucene RAMDirectory from a list of BroadcastDocumentObject */ static Directory createRamDirectoryFromDocuments(List<BroadcastDocumentObject> dictionaryObject) throws IOException { RAMDirectory ramDirectory = new RAMDirectory(); IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LATEST, new StandardAnalyzer(CharArraySet.EMPTY_SET)); IndexWriter writer = new IndexWriter(ramDirectory, writerConfig); for (BroadcastDocumentObject objectDoc : dictionaryObject) { writer.addDocument(BroadcastUtils.createLuceneDocumentFromObject(objectDoc)); } writer.commit(); writer.close(); return ramDirectory; } private static Document createLuceneDocumentFromObject(BroadcastDocumentObject objectDoc) throws IOException { Document indexDoc = new Document(); FieldType ftSyn = new FieldType(); ftSyn.setStored(false); ftSyn.setIndexed(true); ftSyn.setOmitNorms(true); ftSyn.freeze(); indexDoc.add(new StringField(DictionarySearcher.F_CATID, objectDoc.getCategory(), Field.Store.YES)); for (String value : objectDoc.getValueSet()) { // no need to include the field F_RAW during recreation of directory indexDoc.add(new StringField(DictionarySearcher.F_SYNTERM, DictionarySearcher.getJointTokens(value), Field.Store.NO)); } return indexDoc; } }