package word2vec.corpuscreation;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
public class CreateDBpediaEdgeList {
public static final String INFOBOXPROPERTIES = "/home/zwicklbauer/HDTGeneration/infobox_properties_en.nt";
public static final String MAPPINGPROPERTIES = "/home/zwicklbauer/HDTGeneration/mappingbased_properties_cleaned_en.nt";
public static final String EDGELIST = "/home/zwicklbauer/dbpediagraph/deepwalk/dbpedia_edgelist.dat";
public static final String INVERTEDLST = "/home/zwicklbauer/dbpediagraph/deepwalk/invertedEntityIdList.dat";
private Model infoboxes = null;
private Model mapping = null;
private HashSet<String> hashnames;
private HashMap<Integer, String> hashinverted;
private HashMap<String, Integer> hash;
private PrintWriter writer;
public CreateDBpediaEdgeList() {
super();
try {
this.writer = new PrintWriter(EDGELIST);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public void createDBpediaGraph() {
System.out.println("Generate Ids");
generateIDHashMap();
System.out.println("Add Facts");
parseFacts();
writer.flush();
writer.close();
}
private void generateIDHashMap() {
System.out.println("Read Model");
this.hashnames = new HashSet<String>();
infoboxes = ModelFactory.createDefaultModel();
infoboxes.read(INFOBOXPROPERTIES);
StmtIterator it = infoboxes.listStatements();
System.out.println("Finished Model");
while (it.hasNext()) {
Statement s = it.next();
Resource subject = s.getSubject();
RDFNode object = s.getObject();
this.hashnames.add(subject.getURI());
if (object.isResource()) {
Resource obj = object.asResource();
if(obj.getURI().startsWith("http://dbpedia.org/resource/")) {
this.hashnames.add(obj.getURI());
}
}
}
System.out.println("Read Model");
mapping = ModelFactory.createDefaultModel();
mapping.read(MAPPINGPROPERTIES);
it = mapping.listStatements();
System.out.println("Finished Model");
while (it.hasNext()) {
Statement s = it.next();
Resource subject = s.getSubject();
RDFNode object = s.getObject();
this.hashnames.add(subject.getURI());
if (object.isResource()) {
Resource obj = object.asResource();
if(obj.getURI().startsWith("http://dbpedia.org/resource/")) {
this.hashnames.add(obj.getURI());
}
}
}
// Generate int ids
this.hashinverted = new HashMap<Integer, String>();
this.hash = new HashMap<String, Integer>();
int counter = 0;
for(String s : hashnames) {
if(counter % 10000==0) {
System.out.println(s);
System.out.println("ID counter: "+counter);
}
hashinverted.put(counter, s);
hash.put(s, counter);
counter++;
}
}
private void parseFacts() {
StmtIterator it = infoboxes.listStatements();
int counter = 0;
while (it.hasNext()) {
if (counter % 10000 == 0) {
System.out.println(counter);
}
Statement s = it.next();
Resource subject = s.getSubject();
Property pra = s.getPredicate();
RDFNode object = s.getObject();
if (object.isResource()) {
Resource obj = object.asResource();
if (pra.isResource()
&& obj.getURI().startsWith(
"http://dbpedia.org/resource/")) {
writer.println(hash.get(subject.getURI())+"\t"+hash.get(obj.getURI()));
if (counter % 10000 == 0) {
System.out.println(subject.getURI() + " "+obj.getURI());
}
writer.flush();
}
}
counter++;
}
it = mapping.listStatements();
counter = 0;
while (it.hasNext()) {
if (counter % 10000 == 0) {
System.out.println(counter);
}
Statement s = it.next();
Resource subject = s.getSubject();
Property pra = s.getPredicate();
RDFNode object = s.getObject();
if (object.isResource()) {
Resource obj = object.asResource();
if (pra.isResource()
&& obj.getURI().startsWith(
"http://dbpedia.org/resource/")) {
writer.println(hash.get(subject.getURI())+"\t"+hash.get(obj.getURI()));
writer.flush();
}
}
counter++;
}
}
public void outputIDUriMapping() {
try {
PrintWriter writer = new PrintWriter(INVERTEDLST);
for(Map.Entry<Integer,String> entry : hashinverted.entrySet()) {
writer.println(entry.getKey() + "\t"+entry.getValue());
}
writer.flush();
writer.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
CreateDBpediaEdgeList model = new CreateDBpediaEdgeList();
model.createDBpediaGraph();
model.outputIDUriMapping();
}
}