package doser.tools.indexcreation; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.HashSet; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; /** * Needs old Lucene Index * * @author quh * */ public class CreateEntityList { public static void main(String[] args) throws IOException { HashSet<String> set = new HashSet<String>(); // Model m = ModelFactory.createDefaultModel(); // m.read("/home/zwicklbauer/HDTGeneration/Dbpedia2014/instance_types_en.nt"); PrintWriter writer = new PrintWriter( "/home/zwicklbauer/entityList_test.dat"); // StmtIterator it = m.listStatements(); // while (it.hasNext()) { // Statement s = it.next(); // Resource subject = s.getSubject(); // set.add(subject.getURI()); // } BufferedReader reader = null; reader = new BufferedReader(new FileReader( "/mnt/storage/zwicklbauer/urls.txt")); String line = null; while ((line = reader.readLine()) != null) { String[] splitter = line.split("\t"); if (splitter.length == 6 || splitter.length == 5) { String uncovertedUri = splitter[2].replaceAll( "http://en.wikipedia.org/wiki/", ""); String convertedUri = WikiPediaUriConverter .createConformDBpediaUrifromEncodedString(uncovertedUri); set.add(convertedUri); } else { System.out.println(line); } } reader.close(); for (String s : set) { writer.println(s); } writer.close(); } }