package doser.tools.indexcreation; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import doser.lucene.analysis.DoserIDAnalyzer; public class CreateBiomedicalDomainIndex { private String oldIndexPath = "/home/quh/Arbeitsfläche/MMapLuceneIndexStandard/"; private String newIndexPath = "/home/quh/Arbeitsfläche/BiomedicalIndex"; CreateBiomedicalDomainIndex() { super(); } private void readOldIndex() { File oldIndexFile = new File(oldIndexPath); File newIndexFile = new File(newIndexPath); IndexReader readerOldIndex = null; IndexWriter newIndexWriter = null; try { final Directory newDir = FSDirectory.open(newIndexFile); Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>(); analyzerPerField.put("Label", new DoserIDAnalyzer()); analyzerPerField.put("Occurrences", new DoserIDAnalyzer()); analyzerPerField.put("Type", new DoserIDAnalyzer()); PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField); final IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, aWrapper); newIndexWriter = new IndexWriter(newDir, config); final Directory oldDir = FSDirectory.open(oldIndexFile); readerOldIndex = DirectoryReader.open(oldDir); for (int j = 0; j < readerOldIndex.maxDoc(); ++j) { Document oldDoc = readerOldIndex.document(j); String link = oldDoc.get("Mainlink"); if (!link.startsWith("http://dbpedia.org/resource/")) { Document doc = new Document(); doc.add(new StringField("Mainlink", oldDoc.get("Mainlink"), Store.YES)); doc.add(new TextField("LongDescription", oldDoc.get("Description"), Store.YES)); doc.add(new StringField("Occurrences", oldDoc.get("Occurrences"), Store.YES)); doc.add(new StringField("ID", oldDoc.get("ID"), Store.YES)); doc.add(new TextField("Label", oldDoc.get("Label").toLowerCase(), Store.YES)); // Generate UniqueLabelStrings HashSet<String> uniqueLabelStrings = new HashSet<String>(); uniqueLabelStrings.add(oldDoc.get("Label").toLowerCase()); String s = oldDoc.get("Occurrences"); String[] splitter1 = s.split(";;;"); for (int i = 0; i < splitter1.length; i++) { String[] splitter2 = splitter1[i].split(":::"); for (int k = 0; k < splitter2.length; k++) { uniqueLabelStrings.add(splitter2[0]); } } for (String uniqueString : uniqueLabelStrings) { doc.add(new StringField("UniqueLabel", uniqueString, Store.YES)); } newIndexWriter.addDocument(doc); } } readerOldIndex.close(); newIndexWriter.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (readerOldIndex != null) { try { readerOldIndex.close(); } catch (IOException e) { e.printStackTrace(); } } if (newIndexWriter != null) { try { newIndexWriter.close(); } catch (IOException e) { e.printStackTrace(); } } } } public static void main(String[] args) { CreateBiomedicalDomainIndex indexCreation = new CreateBiomedicalDomainIndex(); indexCreation.readOldIndex(); } }