package org.aksw.gerbil.tools;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Set;

import org.aksw.gerbil.dataset.check.index.Indexer;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.semantic.sameas.impl.UriEncodingHandlingSameAsRetriever;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.time.DurationFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This tool can be used to create the Lucene index that can be used for entity
 * checking. A file can be used as source for the data, e.g., the mapping from
 * DBpedia resource to Wikipedia ID.
 *
 * <p>
 * Every {@code .ttl} file of the input folder is read line by line. The first
 * whitespace-separated token of each line is taken as the subject URI of a
 * triple and is handed to the {@link Indexer}, together with the alternative
 * writings returned by the {@link UriEncodingHandlingSameAsRetriever}.
 * </p>
 *
 * @author Michael Röder (roeder@informatik.uni-leipzig.de)
 *
 */
public class DBpediaEntityCheckIndexTool {

    private static final Logger LOGGER = LoggerFactory.getLogger(DBpediaEntityCheckIndexTool.class);

    /** Default folder that is scanned for {@code .ttl} files. */
    private static final String INPUT_FOLDER = "C:/Daten/DBpedia";
    /** Default folder the index is written to. */
    private static final String OUTPUT_FOLDER = "indexes/dbpedia_check";
    /** Number of processed triples after which a progress message is logged. */
    private static final long PROGRESS_INTERVAL = 100000;

    /**
     * Creates the index and logs the start time, end time and overall duration.
     *
     * @param args
     *            optional arguments: {@code args[0]} overrides the input folder,
     *            {@code args[1]} overrides the output folder. If an argument is
     *            missing, the built-in default is used.
     * @throws GerbilException
     *             declared for the indexer calls (creation / closing)
     * @throws IOException
     *             declared for the indexer calls (creation / closing)
     */
    public static void main(String[] args) throws GerbilException, IOException {
        String inputFolder = (args != null && args.length > 0) ? args[0] : INPUT_FOLDER;
        String outputFolder = (args != null && args.length > 1) ? args[1] : OUTPUT_FOLDER;

        Indexer index = Indexer.create(outputFolder);
        SimpleDateFormat format = new SimpleDateFormat();
        Date start = Calendar.getInstance().getTime();
        LOGGER.info("Start indexing at {}", format.format(start));
        try {
            indexFolder(index, inputFolder);
        } finally {
            // close the index even if reading the input failed unexpectedly
            index.close();
        }
        Date end = Calendar.getInstance().getTime();
        LOGGER.info("Indexing finished at {}", format.format(end));
        LOGGER.info("Indexing took: {}", DurationFormatUtils.formatDurationHMS(end.getTime() - start.getTime()));
    }

    /**
     * Indexes all files of the given folder whose name ends with {@code .ttl}.
     * Sub folders are not traversed. If the folder can not be listed, an error
     * is logged and nothing is indexed.
     *
     * @param index
     *            the indexer the URIs are added to
     * @param folder
     *            path of the folder containing the {@code .ttl} files
     */
    public static void indexFolder(Indexer index, String folder) {
        File dir = new File(folder);
        File[] files = dir.listFiles();
        // listFiles() returns null if the path is not a directory or can not be read
        if (files == null) {
            LOGGER.error("Couldn't list the files of \"{}\". It is not a readable directory. It will be ignored.",
                    dir.getAbsolutePath());
            return;
        }
        for (File f : files) {
            if (f.getName().endsWith(".ttl")) {
                index(index, f.getAbsolutePath());
            }
        }
    }

    /**
     * Reads the given file line by line (UTF-8) and indexes the subject of
     * every line that has more than two whitespace-separated tokens. A subject
     * is only processed if it differs from the subject of the directly
     * preceding triple. An {@link IOException} is logged and ends the
     * processing of this file.
     *
     * @param indexer
     *            the indexer the URIs are added to
     * @param file
     *            path of the file that should be read
     */
    public static void index(Indexer indexer, String file) {
        UriEncodingHandlingSameAsRetriever retriever = new UriEncodingHandlingSameAsRetriever();
        LineIterator iterator = null;
        long size = 0, rounds = 0;
        try {
            iterator = FileUtils.lineIterator(new File(file), "UTF-8");
            String uri;
            Set<String> uris;
            String old = null;
            Date start = Calendar.getInstance().getTime();
            // iterate over the lines
            while (iterator.hasNext()) {
                String[] split = iterator.next().split("\\s+");
                if (split.length > 2) {
                    // get the subject of the triple
                    uri = stripAngleBrackets(split[0]);
                    // if this subject is new
                    if (!uri.equals(old)) {
                        // retrieve other writings of this URI
                        uris = retriever.retrieveSameURIs(uri);
                        if (uris != null) {
                            for (String u : uris) {
                                indexer.index(u);
                            }
                        } else {
                            indexer.index(uri);
                        }
                        // remember the subject so that directly following
                        // triples of the same subject are not indexed again
                        old = uri;
                    }
                    size++;
                    if (size % PROGRESS_INTERVAL == 0) {
                        Date end = Calendar.getInstance().getTime();
                        rounds++;
                        String avgTime = DurationFormatUtils
                                .formatDurationHMS((end.getTime() - start.getTime()) / rounds);
                        LOGGER.info("Got 100000 entities...(Sum: {}, AvgTime: {})", size, avgTime);
                    }
                }
            }
        } catch (IOException e) {
            LOGGER.error("Exception while reading file. It will be ignored.", e);
        } finally {
            LineIterator.closeQuietly(iterator);
        }
        LOGGER.info("Successfully indexed {} triples", size);
    }

    /**
     * Removes a leading {@code <} and a trailing {@code >} from the given
     * token, if present.
     */
    private static String stripAngleBrackets(String token) {
        String result = token;
        if (result.startsWith("<")) {
            result = result.substring(1);
        }
        if (result.endsWith(">")) {
            result = result.substring(0, result.length() - 1);
        }
        return result;
    }
}