package edu.nd.nina.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import edu.nd.nina.graph.TypedEdge;
import edu.nd.nina.graph.TypedSimpleGraph;
import edu.nd.nina.structs.Pair;
import edu.nd.nina.types.dblp.Author;
import edu.nd.nina.types.dblp.Paper;
import edu.nd.nina.types.dblp.Venue;
import edu.nd.nina.types.dblp.Year;
/**
 * Loads an ArnetMiner/DBLP citation dump into a {@link TypedSimpleGraph}
 * whose vertices are papers, authors, years and venues.
 */
public class ArnetMiner {
    private static final Logger logger = Logger.getLogger(ArnetMiner.class.getName());

    /**
     * Denominator used for the "% loaded" progress messages while reading
     * records. It only affects logging, never the resulting graph.
     */
    private static final int EXPECTED_PAPER_COUNT = 8300000;

    /**
     * Reads an ArnetMiner plain text dump and adds its content to the graph.
     *
     * Each line begins with an identifier for the data found on that line,
     * as described at the ArnetMiner dataset website:
     *
     * #* --- paperTitle
     *
     * #@ --- Authors (comma separated)
     *
     * #year ---- Year
     *
     * #conf --- publication venue
     *
     * #citation --- number of citations for this paper
     *
     * #index ---- index id of this paper
     *
     * #% ---- the id of references of this paper (there are multiple lines,
     * with each indicating a reference)
     *
     * A blank line (or the next "#*" line) ends the current record. When the
     * "#index" line is reached the paper is added to the graph and linked to
     * the authors, year and venue collected so far for that record. Citation
     * edges are added after the whole input has been read, and only between
     * papers whose index appeared in the input; self citations are dropped.
     *
     * An {@link IOException} raised while reading is logged and not
     * rethrown. The stream is not closed by this method.
     *
     * @param is  stream positioned at the start of the ArnetMiner text data
     * @param tsg graph that receives the vertices and edges
     * @throws NumberFormatException if a "#year" or "#index" value is not an
     *                               integer
     */
    public static void loadDBLPGraphFromFile(InputStream is, TypedSimpleGraph tsg) {
        // NOTE(review): decodes with the platform default charset - confirm
        // the dataset encoding and pass it explicitly if it is fixed.
        BufferedReader br = new BufferedReader(new InputStreamReader(is));

        Map<Integer, Paper> idxToPaper = new TreeMap<Integer, Paper>();
        List<Pair<Integer, Integer>> refs = new ArrayList<Pair<Integer, Integer>>();

        // State of the record currently being read.
        Paper paper = null;
        Integer paperIdx = null;
        List<Author> authors = new ArrayList<Author>();
        Year year = null;
        Venue v = null;

        logger.info("Begin loading ArnetMiner file");

        int perc = -1;
        int i = 0;

        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    // start fresh
                    paper = null;
                    paperIdx = null;
                    authors = new ArrayList<Author>();
                    year = null;
                    v = null;
                } else if (line.startsWith("#*")) {
                    perc = logProgress(i++, EXPECTED_PAPER_COUNT, perc, "% loaded");
                    // A title always opens a record; drop anything left over
                    // from the previous one in case no blank line separated them.
                    paperIdx = null;
                    authors = new ArrayList<Author>();
                    year = null;
                    v = null;
                    paper = new Paper(line.substring(2));
                } else if (line.startsWith("#@")) {
                    for (String author : line.substring(2).split(",")) {
                        authors.add(new Author(author));
                    }
                } else if (line.startsWith("#year")) {
                    year = new Year(Integer.parseInt(line.substring(5).trim()));
                } else if (line.startsWith("#conf")) {
                    v = new Venue(line.substring(5));
                } else if (line.startsWith("#citation")) {
                    if (paper == null) {
                        logger.warning("Ignoring line outside of a paper record: " + line);
                        continue;
                    }
                    paper.setCitations(line.substring(9));
                } else if (line.startsWith("#index")) {
                    if (paper == null) {
                        logger.warning("Ignoring line outside of a paper record: " + line);
                        continue;
                    }
                    int idx = Integer.parseInt(line.substring(6).trim());
                    paperIdx = Integer.valueOf(idx);
                    paper.setIdx(idx);

                    tsg.addVertex(paper);
                    for (Author a : authors) {
                        tsg.addVertex(a);
                        tsg.addEdge(paper, a);
                    }
                    // Year and venue are optional: only link what the record had.
                    if (year != null) {
                        tsg.addVertex(year);
                        tsg.addEdge(paper, year);
                    }
                    if (v != null) {
                        tsg.addVertex(v);
                        tsg.addEdge(paper, v);
                    }
                    idxToPaper.put(paperIdx, paper);
                } else if (line.startsWith("#%")) {
                    String refText = line.substring(2).trim();
                    if (refText.isEmpty()) {
                        continue; // reference line without an id
                    }
                    if (paperIdx == null) {
                        logger.warning("Ignoring reference before #index: " + line);
                        continue;
                    }
                    int refIdx;
                    try {
                        refIdx = Integer.parseInt(refText);
                    } catch (NumberFormatException e) {
                        logger.warning("Ignoring non-numeric reference: " + line);
                        continue;
                    }
                    // no self citations allowed
                    if (refIdx == paperIdx.intValue()) {
                        continue;
                    }
                    // Both sides use the parsed numeric id, i.e. the same key
                    // space as idxToPaper, so the lookup below can succeed.
                    refs.add(new Pair<Integer, Integer>(paperIdx, Integer.valueOf(refIdx)));
                }
            }
            logger.info("File loaded");

            addReferenceEdges(refs, idxToPaper, tsg);

            logger.info("Number vertices " + tsg.vertexSet().size());
            logger.info("Number edges " + tsg.edgeSet().size());
            logger.info("Graph Loaded");
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed to read ArnetMiner data", e);
        }
    }

    /**
     * Adds one paper-to-paper edge per collected reference whose two
     * endpoints were both seen in the input.
     */
    private static void addReferenceEdges(List<Pair<Integer, Integer>> refs,
            Map<Integer, Paper> idxToPaper, TypedSimpleGraph tsg) {
        logger.info("Begin computing references");
        int total = refs.size();
        int perc = -1;
        int i = 0;
        for (Pair<Integer, Integer> ref : refs) {
            perc = logProgress(i++, total, perc, "% references loaded");

            Paper citing = idxToPaper.get(ref.p1);
            Paper cited = idxToPaper.get(ref.p2);
            if (citing == null || cited == null) {
                continue; // the referenced paper is not part of this input
            }
            if (citing.equals(cited)) {
                logger.fine("Self reference " + citing);
                continue;
            }
            tsg.addEdge(citing, cited);
        }
    }

    /**
     * Logs the next whole percentage once {@code processed / total} has
     * passed the last reported one and returns the updated percentage.
     */
    private static int logProgress(int processed, int total, int lastPercent, String suffix) {
        if (processed / (float) total * 100f > lastPercent) {
            lastPercent++;
            logger.info(lastPercent + suffix);
        }
        return lastPercent;
    }

    /**
     * Loads {@code ./data/dblp/test.txt} into a new graph and prints its
     * statistics table to {@code ./data/dblp/testStats}.
     */
    public static void main(String[] args) {
        File data = new File("./data/dblp/test.txt");
        TypedSimpleGraph tsg = new TypedSimpleGraph(TypedEdge.class);
        InputStream in = null;
        try {
            NINALogger.setup();
            in = FileHandler.toInputStream(data);
            loadDBLPGraphFromFile(in, tsg);
            PrintStatistics.PrintTypedGraphStatTable(tsg, "./data/dblp/testStats", "DBLPTypedGraph");
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Could not process " + data, e);
        } finally {
            // The loader leaves the stream open, so release it here.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    logger.log(Level.WARNING, "Could not close " + data, e);
                }
            }
        }
    }
}