package org.aksw.gerbil.tools; import java.io.BufferedOutputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.List; import org.aksw.gerbil.dataset.Dataset; import org.aksw.gerbil.dataset.DatasetConfiguration; import org.aksw.gerbil.dataset.check.EntityCheckerManager; import org.aksw.gerbil.dataset.check.impl.EntityCheckerManagerImpl; import org.aksw.gerbil.datatypes.ExperimentType; import org.aksw.gerbil.exceptions.GerbilException; import org.aksw.gerbil.semantic.sameas.SameAsRetriever; import org.aksw.gerbil.semantic.sameas.impl.ErrorFixingSameAsRetriever; import org.aksw.gerbil.transfer.nif.Document; import org.aksw.gerbil.transfer.nif.MeaningSpan; import org.aksw.gerbil.web.config.AdapterList; import org.aksw.gerbil.web.config.DatasetsConfig; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class UriExport { private static final Logger LOGGER = LoggerFactory.getLogger(UriExport.class); private static final SameAsRetriever SAME_AS_RETRIEVER = new ErrorFixingSameAsRetriever(); private static final EntityCheckerManager ENTITY_CHECKER_MANAGER = new EntityCheckerManagerImpl(); public static void main(String[] args) { PrintStream pout = null; try { pout = new PrintStream(new BufferedOutputStream(new FileOutputStream("exportedURIs.txt"))); AdapterList<DatasetConfiguration> adapterList = DatasetsConfig.datasets(ENTITY_CHECKER_MANAGER, SAME_AS_RETRIEVER); List<DatasetConfiguration> datasetConfigs = null; datasetConfigs = adapterList.getAdaptersForExperiment(ExperimentType.D2KB); for (DatasetConfiguration datasetConfig : datasetConfigs) { try { Dataset dataset = datasetConfig.getDataset(ExperimentType.D2KB); printDatasetUris(dataset, pout); LOGGER.info("Finished {}", dataset.getName()); } catch (GerbilException e) { LOGGER.error("Couldn't load dataset. It will be ignored.", e); } } } catch (IOException e) { LOGGER.error("Error while writing file. Aborting.", e); } finally { IOUtils.closeQuietly(pout); } } private static void printDatasetUris(Dataset dataset, PrintStream pout) { for (Document document : dataset.getInstances()) { String text = document.getText(); for (MeaningSpan meaning : document.getMarkings(MeaningSpan.class)) { for (String uri : meaning.getUris()) { pout.print(uri); pout.print('\t'); pout.print(text.substring(meaning.getStartPosition(), meaning.getStartPosition() + meaning.getLength())); pout.println(); } } } } }