package edu.nd.nina.io;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;

import edu.nd.nina.graph.TypedEdge;
import edu.nd.nina.graph.TypedSimpleGraph;
import edu.nd.nina.types.dblp.Author;
import edu.nd.nina.types.dblp.Paper;
import edu.nd.nina.types.dblp.Term;
import edu.nd.nina.types.dblp.Venue;

/**
 * Loads a DBLP XML dump into a {@link TypedSimpleGraph}.
 *
 * <p>Each publication record becomes a {@link Paper} vertex that is connected
 * to one {@link Venue} vertex, one {@link Author} vertex per {@code <author>}
 * element and one {@link Term} vertex per word of its title.
 */
public class DBLP {

	private static final Logger logger = Logger.getLogger(DBLP.class.getName());

	/**
	 * Parses the XML document read from {@code is} with a validating SAX
	 * parser and adds the resulting vertices and edges to {@code tsg}.
	 *
	 * <p>Failures are logged (with their cause) and otherwise swallowed, so
	 * {@code tsg} may be only partially populated when this method returns.
	 * The stream is not closed here; that is the caller's responsibility.
	 *
	 * @param is  stream positioned at the start of the XML document
	 * @param tsg graph that receives the parsed vertices and edges
	 */
	private static void loadDBLPGraphFromFile(InputStream is,
			TypedSimpleGraph tsg) {
		try {
			SAXParserFactory parserFactory = SAXParserFactory.newInstance();
			SAXParser parser = parserFactory.newSAXParser();
			ConfigHandler handler = new ConfigHandler(tsg);
			parser.getXMLReader().setFeature(
					"http://xml.org/sax/features/validation", true);
			parser.parse(is, handler);
		} catch (IOException e) {
			logger.log(Level.SEVERE, "Error reading URI: " + e.getMessage(), e);
		} catch (SAXException e) {
			logger.log(Level.SEVERE, "Error in parsing: " + e.getMessage(), e);
		} catch (ParserConfigurationException e) {
			logger.log(Level.SEVERE,
					"Error in XML parser configuration: " + e.getMessage(), e);
		}
	}

	/**
	 * SAX handler that turns publication records into graph vertices/edges.
	 *
	 * <p>An element whose name is in {@link #paperType} opens a record. Its
	 * direct children are the record's fields; any elements nested inside a
	 * field (inline markup) only contribute their text to that field.
	 */
	private static class ConfigHandler extends DefaultHandler {

		/**
		 * Denominator for the progress percentage.
		 * NOTE(review): presumably the record count of the dump this was
		 * written against - confirm when the input file changes.
		 */
		private static final int TOTAL_RECORDS = 2160375;

		/**
		 * Once the progress counter exceeds this value every callback becomes
		 * a no-op, so only the first part of the document is loaded.
		 * NOTE(review): looks like a sampling/debugging cap - confirm before
		 * changing it.
		 */
		private static final int MAX_PERCENT = 2;

		private final TypedSimpleGraph tsg;
		private final Set<String> paperType = new HashSet<String>();
		private final Set<String> venueType = new HashSet<String>();
		private final List<Author> currentAuthors = new ArrayList<Author>();
		private final List<Term> currentTerms = new ArrayList<Term>();

		/** Text of the field currently being read. */
		private final StringBuilder value = new StringBuilder();

		private Locator locator;

		/** Record being assembled, or {@code null} when outside any record. */
		private Paper current;
		private Venue currentVenue;

		/** Element nesting depth below the current record (0 = directly in it). */
		private int fieldDepth = 0;

		private int perc = 0;
		private int recordsSeen = 0;

		public ConfigHandler(TypedSimpleGraph tsg) {
			this.tsg = tsg;

			paperType.add("article");
			paperType.add("inproceedings");
			paperType.add("book");
			paperType.add("proceedings");
			paperType.add("phdthesis");
			paperType.add("mastersthesis");
			paperType.add("incollection");
			paperType.add("www");

			venueType.add("journal");
			venueType.add("booktitle");
		}

		@Override
		public void setDocumentLocator(Locator locator) {
			this.locator = locator;
		}

		/**
		 * Opens a new record when {@code rawName} is a record type; otherwise
		 * tracks field nesting so that only the outermost field element
		 * resets the text buffer.
		 */
		@Override
		public void startElement(String namespaceURI, String localName,
				String rawName, Attributes atts) throws SAXException {
			if (perc > MAX_PERCENT) {
				return;
			}

			if (paperType.contains(rawName)) {
				if (perc < recordsSeen++ / (float) TOTAL_RECORDS * 100f) {
					perc++;
					logger.info(perc + "% loaded");
				}
				current = new Paper("");
				currentVenue = null;
				currentAuthors.clear();
				currentTerms.clear();
				fieldDepth = 0;
				value.setLength(0);
			} else if (current != null) {
				// Only a direct child of the record starts a fresh field;
				// nested inline markup keeps appending to the same buffer so
				// the text preceding it is not lost.
				if (fieldDepth++ == 0) {
					value.setLength(0);
				}
			}
		}

		/**
		 * Commits a finished record to the graph, or stores a finished field
		 * on the record being assembled.
		 */
		@Override
		public void endElement(String namespaceURI, String localName,
				String rawName) throws SAXException {
			if (perc > MAX_PERCENT) {
				return;
			}

			if (paperType.contains(rawName)) {
				commitCurrentRecord();
				// Forget the record so elements outside any record (e.g. the
				// document root) cannot modify a paper already in the graph.
				current = null;
				fieldDepth = 0;
				return;
			}

			if (current == null) {
				// Element outside any record: nothing to attach it to.
				return;
			}

			if (--fieldDepth > 0) {
				// Closing tag of inline markup inside a field; the field
				// itself is still open.
				return;
			}

			String text = value.toString();
			if (venueType.contains(rawName)) {
				currentVenue = new Venue(text);
			} else if (rawName.equalsIgnoreCase("title")) {
				current.setTitle(text);
				for (String token : text.split("\\W+")) {
					// split() yields an empty first token when the title
					// starts with a non-word character (or is empty).
					if (token.length() > 0) {
						currentTerms.add(new Term(token));
					}
				}
			} else if (rawName.equalsIgnoreCase("author")) {
				currentAuthors.add(new Author(text));
			} else {
				current.addAttribute(rawName, text);
			}
		}

		/** Adds the current record and everything it links to to the graph. */
		private void commitCurrentRecord() {
			// Bump the record's index until the graph no longer reports it as
			// already present.
			int idx = 0;
			while (tsg.containsVertex(current)) {
				idx++;
				current.setIdx(idx);
			}
			tsg.addVertex(current);

			if (currentVenue == null) {
				// No journal/booktitle field was seen; fall back to the
				// publisher attribute.
				// NOTE(review): the record may not have one - confirm Venue
				// tolerates that.
				currentVenue = new Venue(current.getAttribute("publisher"));
			}
			tsg.addVertex(currentVenue);
			tsg.addEdge(current, currentVenue);

			for (Author author : currentAuthors) {
				tsg.addVertex(author);
				tsg.addEdge(current, author);
			}
			for (Term term : currentTerms) {
				tsg.addVertex(term);
				tsg.addEdge(current, term);
			}
		}

		/**
		 * Appends a chunk of character data to the open field. The parser may
		 * deliver one field's text in several chunks.
		 */
		@Override
		public void characters(char[] ch, int start, int length)
				throws SAXException {
			if (perc > MAX_PERCENT) {
				return;
			}
			if (fieldDepth > 0) {
				value.append(ch, start, length);
			}
		}

		/** Logs the position and message of a parser diagnostic. */
		private void report(Level level, String mode,
				SAXParseException exception) {
			logger.log(level, mode + " Line: " + exception.getLineNumber()
					+ " URI: " + exception.getSystemId() + "\n" + " Message: "
					+ exception.getMessage());
		}

		/** Treats parser warnings as fatal: logs and aborts the parse. */
		@Override
		public void warning(SAXParseException exception) throws SAXException {
			report(Level.WARNING, "**Parsing Warning**\n", exception);
			throw new SAXException("Warning encountered", exception);
		}

		/** Logs a recoverable parser error and aborts the parse. */
		@Override
		public void error(SAXParseException exception) throws SAXException {
			report(Level.SEVERE, "**Parsing Error**\n", exception);
			throw new SAXException("Error encountered", exception);
		}

		/** Logs a fatal parser error and aborts the parse. */
		@Override
		public void fatalError(SAXParseException exception)
				throws SAXException {
			report(Level.SEVERE, "**Parsing Fatal Error**\n", exception);
			throw new SAXException("Fatal Error encountered", exception);
		}
	}

	/**
	 * Loads {@code ./data/dblp/dblp.xml.gz} into a graph and prints its
	 * assortativity statistics.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		File data = new File("./data/dblp/dblp.xml.gz");
		TypedSimpleGraph tsg = new TypedSimpleGraph(TypedEdge.class);

		try {
			NINALogger.setup();

			InputStream is = FileHandler.toInputStream(data);
			try {
				loadDBLPGraphFromFile(is, tsg);
			} finally {
				// Release the file handle whether or not parsing succeeded.
				if (is != null) {
					is.close();
				}
			}

			//PrintStatistics.PrintTypedGraphStatTable(tsg,
			//		"./data/dblp/testStats", "DBLPTypedGraph");
			//PrintStatistics.PrintCrazyCCF(tsg, "./data/dblp/testStats", "DBLPTypedGraph");
			//CalculateStatistics.calcAssortativity(tsg, -1);
			PrintStatistics.PrintCrazyAssortativity(tsg, "./data/dblp/testStats", "DBLPTypedGraph");
		} catch (IOException e) {
			logger.log(Level.SEVERE, "Failed to process " + data.getPath(), e);
		}
	}
}