package table.imdb; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.util.Enumeration; import java.util.HashMap; import java.util.Map; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class IMDBTableConverter { public static String RAWDIRECTORY = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/imdb_raw/"; public static String TRIPPLEFILE = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/freebase_links_en.nt"; public static String GTDIRECTORY = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/imdb_entity_keys/"; public static final String OUTPUTFILE = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/imdb_columns.txt"; private HashMap<String, String> uriconversion; private HashMap<Integer, String> groundtruth; private PrintWriter writer; public IMDBTableConverter() { super(); this.uriconversion = new HashMap<String, String>(); try { writer = new PrintWriter(new File(OUTPUTFILE)); } catch (FileNotFoundException e) { e.printStackTrace(); } } public void readFile(File file) { File[] files = file.listFiles(); for (int i = 0; i < files.length; i++) { writer.append(files[i].getName()); writer.append(System.lineSeparator()); File gtfile = new File(GTDIRECTORY + files[i].getName() + ".keys"); // System.out.println(GTDIRECTORY + files[i].getName() + ".keys"); readGroundtruthFile(gtfile); processFile(files[i]); writer.append(System.lineSeparator()); } } public void processFile(File input) { String c = ""; try { String line = null; BufferedReader reader = new BufferedReader(new FileReader(input)); while ((line = reader.readLine()) != null) { c += line; } reader.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e1) { e1.printStackTrace(); } convert(c); } public void convert(String html) { Converter parser = new Converter(writer, groundtruth); Reader in = new StringReader(html); try { // the HTML to convert parser.parse(in); } catch (Exception e) { } finally { try { in.close(); } catch (IOException ioe) { // this should never happen } } } public void readTripples() { File nfile = new File(TRIPPLEFILE); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(nfile)); String line = null; reader.readLine(); while ((line = reader.readLine()) != null) { String[] splitter = line.split(" "); String freebaseOrig = splitter[splitter.length - 2]; // Freebase uri freebaseOrig = freebaseOrig.replaceAll("<http://rdf.freebase.com/ns", "").replaceAll(">", "") .replaceAll("\\.", "/"); String dbpediaUri = splitter[0]; dbpediaUri = dbpediaUri.replaceAll("<|>", ""); if(uriconversion.containsKey(freebaseOrig)) { String uris = uriconversion.get(freebaseOrig); uris += ","+dbpediaUri; uriconversion.put(freebaseOrig, uris); } else { uriconversion.put(freebaseOrig, dbpediaUri); } } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } public void readGroundtruthFile(File gt) { BufferedReader reader = null; this.groundtruth = new HashMap<Integer, String>(); try { reader = new BufferedReader(new FileReader(gt)); String line = null; while ((line = reader.readLine()) != null) { int row = Integer.valueOf(line.split(",")[0]); // int row = Integer.valueOf(line.replaceAll(",*", "")); String freebaseGT = line.replaceAll(".*=", ""); freebaseGT = freebaseGT.substring(0, freebaseGT.length() - 1); groundtruth.put(row, freebaseGT); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } } class Converter extends HTMLEditorKit.ParserCallback { private boolean isRelevantCell = false; private boolean isCorrectTable = false; private boolean checkText = false; private int rowCounter = -1; private PrintWriter writer; Map<Integer, String> groundtruth; Converter(PrintWriter writer, HashMap<Integer, String> groundtruth) { super(); this.writer = writer; this.groundtruth = groundtruth; } public void parse(Reader in) throws IOException { ParserDelegator delegator = new ParserDelegator(); // the third parameter is TRUE to ignore charset directive delegator.parse(in, this, Boolean.TRUE); } private boolean isRelevantTable(String table) { return table.contains("cast_list"); } public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { if (t.toString().equals("table") && isRelevantTable(a.toString())) { isCorrectTable = true; rowCounter = -1; } else if (isCorrectTable && t.toString().equals("a")) { @SuppressWarnings("rawtypes") Enumeration e = a.getAttributeNames(); boolean isUrl = false; boolean ishref = false; while (e.hasMoreElements()) { Object obj = e.nextElement(); if (obj.toString().equalsIgnoreCase("href")) { ishref = true; } else if (obj.toString().equalsIgnoreCase("itemprop")) { String prop = a.getAttribute(obj).toString(); if (prop.equals("url")) { isUrl = true; } } } if (isUrl && ishref) { isRelevantCell = true; } } else if (isCorrectTable && isRelevantCell && t.toString().equals("span")) { @SuppressWarnings("rawtypes") Enumeration e = a.getAttributeNames(); boolean isName = false; while (e.hasMoreElements()) { Object obj = e.nextElement(); if (obj.toString().equalsIgnoreCase("itemprop")) { String prop = a.getAttribute(obj).toString(); if (prop.equals("name")) { isName = true; break; } } } if (isName) { rowCounter++; checkText = true; } } } public void handleEndTag(HTML.Tag t, int pos) { if (t.toString().equals("table") && isCorrectTable) { isCorrectTable = false; } else if (t.toString().equals("a") && isCorrectTable && isRelevantCell) { isRelevantCell = false; } else if (t.toString().equals("span") && isCorrectTable && isRelevantCell && checkText) { checkText = false; } } public void handleText(char[] text, int pos) { String s = new String(text); if (checkText) { writer.append(s); writer.append("\t"); String gt = this.groundtruth.get(rowCounter); System.out.println(this.groundtruth.toString()); System.out.println(rowCounter); String set = uriconversion.get(gt); if (set == null) { System.out.println("Appebden tun mir"); writer.append("\n"); } else { StringBuilder builder = new StringBuilder(); // for (String se : set) { builder.append(set); // } String convertedGt = builder.toString(); writer.append(convertedGt); writer.append("\n"); } writer.flush(); } } } public static void main(String args[]) { IMDBTableConverter imdbConverter = new IMDBTableConverter(); imdbConverter.readTripples(); System.out.println("Finished Tripple Reading"); imdbConverter.readFile(new File(RAWDIRECTORY)); } }