package edu.nd.nina.io; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.logging.Logger; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; import edu.nd.nina.Graphs; import edu.nd.nina.Type; import edu.nd.nina.alg.ConstrainedRandomWalkWithRestart; import edu.nd.nina.alg.MetaPath; import edu.nd.nina.graph.TypedEdge; import edu.nd.nina.graph.TypedSimpleGraph; import edu.nd.nina.math.Moment; import edu.nd.nina.structs.Triple; import edu.nd.nina.types.kddcup2013.Affiliation; import edu.nd.nina.types.kddcup2013.Author; import edu.nd.nina.types.kddcup2013.AuthorAlsoKnownAs; import edu.nd.nina.types.kddcup2013.Paper; import edu.nd.nina.types.kddcup2013.Term; import edu.nd.nina.types.kddcup2013.Venue; import edu.nd.nina.types.kddcup2013.VenueWebPage; import edu.nd.nina.types.kddcup2013.Year; public class KDDCup2013 { private static Logger logger = Logger.getLogger(KDDCup2013.class.getName()); private static String otherThanQuote = " [^\"] "; private static String quotedString = String.format(" \" %s* \" ", otherThanQuote); private static String csvregex = String.format("(?x) " + // enable comments, // ignore white // spaces ", " + // match a comma "(?= " + // start positive look ahead " ( " + // start group 1 " %s* " + // match 'otherThanQuote' zero or // more times " %s " + // match 'quotedString' " )* " + // end group 1 and repeat it zero or // more times " %s* " + // match 'otherThanQuote' " $ " + // match the end of the string ") ", // stop positive look ahead otherThanQuote, quotedString, otherThanQuote); private static File dataFolder = new File("./data/kddcup2013/"); private static File serial = new File(dataFolder, "serial.bin"); private static File serialMap = new File(dataFolder, "serialMap.bin"); private static File valid = new File(dataFolder.getAbsolutePath() + "\\" + "Valid.csv"); private static File train = new File(dataFolder.getAbsolutePath() + "\\" + "Train.csv"); private static Map<Integer, Author> authorMap = new HashMap<Integer, Author>(); private static Map<Integer, Paper> paperMap = new HashMap<Integer, Paper>(); private static Map<Integer, Venue> venueMap = new HashMap<Integer, Venue>(); private static void loadKDDCupGraphFromFolder(File dataFolder, TypedSimpleGraph tsg) throws IOException { File author = new File(dataFolder.getAbsolutePath() + "\\" + "Author.csv"); File conference = new File(dataFolder.getAbsolutePath() + "\\" + "Conference.csv"); File journal = new File(dataFolder.getAbsolutePath() + "\\" + "Journal.csv"); File paper = new File(dataFolder.getAbsolutePath() + "\\" + "sanitizedPaper.csv"); File paperAuthor = new File(dataFolder.getAbsolutePath() + "\\" + "PaperAuthor.csv"); valid = new File(dataFolder.getAbsolutePath() + "\\" + "Valid.csv"); // load author BufferedReader br = new BufferedReader(new FileReader(author)); String line = ""; br.readLine(); // eat the first line while ((line = br.readLine()) != null) { String[] authorline = line.split(csvregex); Author a; if (authorline.length >= 2) { a = new Author(Integer.parseInt(authorline[0]), authorline[1]); } else { a = new Author(Integer.parseInt(authorline[0]), ""); } tsg.addVertex(a); authorMap.put(Integer.parseInt(authorline[0]), a); if (authorline.length == 3) { Affiliation affil = new Affiliation(authorline[2]); tsg.addVertex(affil); tsg.addEdge(a, affil); } } br.close(); // load conference br = new BufferedReader(new FileReader(conference)); line = ""; br.readLine(); // eat the first line int journalOffset = 0; // id shortname, fullname, homepage while ((line = br.readLine()) != null) { String[] confline = line.split(csvregex); Venue v = new Venue(Integer.parseInt(confline[0]), confline[1].replace("\"", ""), confline[2].replace("\"", "")); tsg.addVertex(v); journalOffset++; venueMap.put(Integer.parseInt(confline[0]), v); if (confline.length == 4) { URI url = null; try { url = new URI(confline[3].replace("\"", "")); } catch (URISyntaxException e) { // eat } if (url != null) { VenueWebPage u = new VenueWebPage(url); tsg.addVertex(u); tsg.addEdge(v, u); } } } br.close(); // load journal br = new BufferedReader(new FileReader(journal)); line = ""; br.readLine(); // eat the first line // id shortname, fullname, homepage while ((line = br.readLine()) != null) { String[] jline = line.split(csvregex); Venue v = new Venue(Integer.parseInt(jline[0]) + journalOffset, jline[1].replace("\"", ""), jline[2].replace("\"", "")); tsg.addVertex(v); venueMap.put(Integer.parseInt(jline[0]) + journalOffset, v); if (jline.length == 4) { URI url = null; try { url = new URI(jline[3].replace("\"", "")); } catch (URISyntaxException e) { // eat } if (url != null) { VenueWebPage u = new VenueWebPage(url); tsg.addVertex(u); tsg.addEdge(v, u); } } } br.close(); // load paper br = new BufferedReader(new FileReader(paper)); line = ""; int count = 1000000; int i = 0; int perc = 0; // br.readLine(); // eat the first line // Id Title Year ConferenceId JournalId Keyword while ((line = br.readLine()) != null) { while (line.length() - line.replaceAll(csvregex, "").length() < 5) { String newline = ltrim(br.readLine()); line = line + " " + newline; } if (perc < (++i / (float) count) * 100) { logger.info(++perc + "%"); } if (perc > 4) break; String[] pline = line.split(csvregex); Paper p; if (pline.length == 6) { p = new Paper(Integer.parseInt(pline[0]), pline[1].replaceAll( "\"", "").toLowerCase(), pline[5]); } else { p = new Paper(Integer.parseInt(pline[0]), pline[1].replaceAll( "\"", "").toLowerCase(), null); } paperMap.put(Integer.parseInt(pline[0]), p); tsg.addVertex(p); try { Integer year = Integer.parseInt(pline[2]); if (year != 0) { Year y = new Year(year); tsg.addVertex(y); tsg.addEdge(p, y); } } catch (Exception e) { System.out.println(line); } Integer confid = Integer.parseInt(pline[3]); Integer jid = Integer.parseInt(pline[4]); if (!(confid == -1 || jid == -1)) { if (confid == 0) { tsg.addEdge(p, venueMap.get(jid + journalOffset)); } else { if (venueMap.containsKey(confid) == true) { tsg.addEdge(p, venueMap.get(confid)); } } } String[] terms = pline[1].replaceAll("\"", "").split("\\W++"); for (String t : terms) { Term term = new Term(t.toLowerCase().trim()); tsg.addVertex(term); tsg.addEdge(p, term); } if (pline.length == 6) { terms = pline[5].split("\\W++"); for (String t : terms) { Term term = new Term(t.toLowerCase().trim()); tsg.addVertex(term); tsg.addEdge(p, term); } } } br.close(); // load paper-Author br = new BufferedReader(new FileReader(paperAuthor)); line = ""; count = 12000000; i = 0; perc = 0; br.readLine(); // eat the first line // paperid, authorid, authorname, authoraffil while ((line = br.readLine()) != null) { while (line.length() - line.replaceAll(csvregex, "").length() < 3) { String newline = ltrim(br.readLine()); line = line + " " + newline; } if (perc < (++i / (float) count) * 100) { logger.info(++perc + "%"); } if (perc > 4) break; String[] paline = line.split(csvregex); Paper p = paperMap.get(Integer.parseInt(paline[0])); if (p == null){ p = new Paper(Integer.parseInt(paline[0]), "", ""); paperMap.put(p.getId(), p); tsg.addVertex(p); } Author a = authorMap.get(Integer.parseInt(paline[1])); if (a == null){ int aid = Integer.parseInt(paline[1]); a = new Author(aid, ""); authorMap.put(aid, a); tsg.addVertex(a); } tsg.addEdge(p, a); if (paline.length == 3) { AuthorAlsoKnownAs aka = new AuthorAlsoKnownAs(paline[2]); tsg.addVertex(aka); tsg.addEdge(a, aka); } if (paline.length == 4) { Affiliation affil = new Affiliation(paline[3]); tsg.addVertex(affil); tsg.addEdge(a, affil); } } br.close(); // load train } private static String ltrim(String s) { int i = 0; while (i < s.length() && Character.isWhitespace(s.charAt(i))) { i++; } return s.substring(i); } public static void main(String[] args) { TypedSimpleGraph tsg = null;// = new TypedSimpleGraph(TypedEdge.class); List<Map<Integer, ? extends Type>> mapList = null; Kryo kryo = new Kryo(); try { NINALogger.setup(); if (serial.exists()) { try { Input input = new Input(new FileInputStream(serial)); Input inputMap = new Input(new FileInputStream(serialMap)); tsg = kryo.readObject(input, TypedSimpleGraph.class); mapList = kryo.readObject(inputMap, ArrayList.class); authorMap = (Map<Integer, Author>) mapList.get(0); paperMap = (Map<Integer, Paper>) mapList.get(1); venueMap = (Map<Integer, Venue>) mapList.get(2); input.close(); inputMap.close(); } catch (Exception e) { System.out .println("Exception during deserialization: " + e); System.exit(0); } } else { tsg = new TypedSimpleGraph(TypedEdge.class); loadKDDCupGraphFromFolder(dataFolder, tsg); } PrintWriter pwTrain = new PrintWriter(dataFolder + "/AP_train.csv"); ConstrainedRandomWalkWithRestart crwr = new ConstrainedRandomWalkWithRestart( tsg, 0.15f); // for each training pair BufferedReader br = new BufferedReader(new FileReader(train)); String line = ""; br.readLine(); // eat the first line // Create the pipeline that will take as input {data = File, target // = String for classname} // and turn them into {data = FeatureVector, target = Label} //Pipe instancePipe = new SerialPipes(new Pipe[] { // new Target2Label(), new Array2FeatureVector() }); // Create an empty list of the training instances //InstanceList ilist = new InstanceList(instancePipe); int count = 3740; int i = 0; int perc = 0; // authorid, confirmedpaperid, deletedpaperid while ((line = br.readLine()) != null) { if (perc < (++i / (float) count) * 100) { logger.info(++perc + "%"); } String[] tline = line.split(csvregex); Author a = authorMap.get(Integer.parseInt(tline[0])); if (a == null) logger.severe("Author is null"); MetaPath mp = new MetaPath(a); mp.addToPath(Paper.class); Triple<Map<Type, Integer>, Float, Map<Type, Moment>> ap = crwr.allTopoCounts(mp); mp = new MetaPath(a); mp.addToPath(Paper.class); mp.addToPath(Term.class); mp.addToPath(Paper.class); Triple<Map<Type, Integer>, Float, Map<Type, Moment>> aptp = crwr.allTopoCounts(mp, 10); for (String pid : tline[1].split(" ")) { Integer pi = Integer.parseInt(pid); Paper p = paperMap.get(pi); if (p == null) continue; // a -> p = 1 Integer apI = ap.v1.get(p); ap.v3.get(p).def(); Integer aptpI = aptp.v1.get(p); Float aptpRWI = 0f; if(aptpI == null){ aptpI = 0; aptp.v2 = 1f; }else{ aptp.v3.get(p).def(); aptpRWI = aptp.v3.get(p).getMean(); } pwTrain.println(a.getUniqueIdentifier().replaceFirst("<Author>", "") + "," + apI + "," + ((double) apI) / ap.v2 + "," + ap.v3.get(p).getMean() + "," + aptpI + "," + ((double) aptpI) / aptp.v2 + "," + aptpRWI + ",1"); } for (String pid : tline[2].split(" ")) { Integer pi = Integer.parseInt(pid); Paper p = paperMap.get(pi); if (p == null) continue; // a -> p = 0 Integer apI = ap.v1.get(p); ap.v3.get(p).def(); Integer aptpI = aptp.v1.get(p); Float aptpRWI = 0f; if(aptpI == null){ aptpI = 0; aptp.v2 = 1f; }else{ aptp.v3.get(p).def(); aptp.v2 = (float)aptpI; aptpRWI = aptp.v3.get(p).getMean(); } pwTrain.println(a.getUniqueIdentifier().replaceFirst("<Author>", "") + "," + apI + "," + ((double) apI) / ap.v2 + "," + ap.v3.get(p).getMean() + "," + aptpI + "," + ((double) aptpI) / aptp.v2 + "," + aptpRWI + ",0"); } } br.close(); // Create a classifier trainer, and use it to create a classifier //ClassifierTrainer naiveBayesTrainer = new NaiveBayesTrainer(); //Classifier classifier = naiveBayesTrainer.train(ilist); //System.out.println("The training accuracy is " // + classifier.getAccuracy(ilist)); // // // // // // // for each training pair br = new BufferedReader(new FileReader(valid)); line = ""; br.readLine(); // eat the first line count = 1497; i = 0; perc = 0; PrintWriter pwTest = new PrintWriter(dataFolder + "/AP_test.csv"); // authorid, confirmedpaperid, deletedpaperid while ((line = br.readLine()) != null) { if (perc < (++i / (float) count) * 100) { logger.info(++perc + "%"); } String[] tline = line.split(csvregex); Author a = authorMap.get(Integer.parseInt(tline[0])); if (a == null) continue; MetaPath mp = new MetaPath(a); mp.addToPath(Paper.class); Triple<Map<Type, Integer>, Float, Map<Type, Moment>> ap = crwr.allTopoCounts(mp); mp = new MetaPath(a); mp.addToPath(Paper.class); mp.addToPath(Term.class); mp.addToPath(Paper.class); Triple<Map<Type, Integer>, Float, Map<Type, Moment>> aptp = crwr.allTopoCounts(mp, 10); TreeMap<Double, Integer> output = new TreeMap<Double, Integer>(); //List<Integer> negClass = new ArrayList<Integer>(); for (Type t : Graphs.neighborListOf(tsg, a)) { if (!(t instanceof Paper)) continue; Paper p = (Paper) t; Integer apI = ap.v1.get(p); ap.v3.get(p).def(); Integer aptpI = aptp.v1.get(p); Float aptpRWI = 0f; if(aptpI == null){ aptpI = 0; aptp.v2 = 1f; }else{ aptp.v3.get(p).def(); aptpRWI = aptp.v3.get(p).getMean(); } //InstanceList iListTest = new InstanceList(instancePipe); pwTest.println(a.getUniqueIdentifier().replaceFirst("<Author>", "") + "," + apI +","+ ((double) apI) / ap.v2+","+ ap.v3.get(p).getMean()+","+ aptpI+","+ ((double) aptpI) / aptp.v2+","+ aptpRWI+",1"); //LabelVector l = classifier.classify(iListTest).get(0).getLabelVector(); //output.put(l.value(0), p.getId()); } //String sb = new String(); // for (Entry<Double, Integer> e : output.entrySet()) { // sb = e.getValue() + " " + sb; // } // pw.println(sb); } br.close(); pwTrain.close(); pwTest.close(); if (!serial.exists()) { // Object serialization try { Output output = new Output(new FileOutputStream(serial)); Output outputMap = new Output(new FileOutputStream( serialMap)); kryo.writeObject(output, tsg); List<Map<Integer, ? extends Type>> o = new ArrayList<Map<Integer, ? extends Type>>(); o.add(authorMap); o.add(paperMap); o.add(venueMap); kryo.writeObject(outputMap, o); output.close(); outputMap.close(); } catch (Exception e) { System.out.println("Exception during serialization: " + e); System.exit(0); } } } catch (IOException e) { e.printStackTrace(); } } }