package table.imdb;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import experiments.table.limaye.corrected.Table;
import experiments.table.limaye.corrected.Table.Column;
import experiments.table.limaye.corrected.Table.Column.Cell;
public class MusicBrainzConverter {
public static String RAWDIRECTORY = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/musicBrainz_raw/";
public static String TRIPPLEFILE = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/freebase_links_en.nt";
public static String GTDIRECTORY = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/musicbrainz_entity_keys/";
public static final String OUTPUTFILE = "/home/quh/Arbeitsfläche/Table Disambiguation Data sets/musicbrainz_columns.txt";
private HashMap<String, String> uriconversion;
private HashMap<String, String> groundtruth;
private PrintWriter writer;
private String filename;
public MusicBrainzConverter() {
super();
this.filename = "";
this.uriconversion = new HashMap<String, String>();
try {
writer = new PrintWriter(new File(OUTPUTFILE));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public void readFile(File file) {
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++) {
this.filename = files[i].getName();
File gtfile = new File(GTDIRECTORY + files[i].getName() + ".keys");
readGroundtruthFile(gtfile);
processFile(files[i]);
}
}
public void processFile(File input) {
String c = "";
try {
String line = null;
BufferedReader reader = new BufferedReader(new FileReader(input));
while ((line = reader.readLine()) != null) {
c += line;
}
reader.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}
convert(c);
}
public void convert(String html) {
Converter parser = new Converter(groundtruth);
Reader in = new StringReader(html);
try {
// the HTML to convert
parser.parse(in);
} catch (Exception e) {
} finally {
try {
in.close();
} catch (IOException ioe) {
// this should never happen
}
}
parser.integrateGT();
Table t = parser.getTable();
int colnr = t.getNumberofColumns();
int max = 2;
if (colnr < max) {
max = colnr;
}
for (int i = 0; i < max; i++) {
writer.append(filename);
writer.append(System.lineSeparator());
Column c = t.getColumn(i);
List<Cell> cellList = c.getCellList();
for (Cell cell : cellList) {
writer.append(cell.getCellContent().replaceAll("\\n\\r", "") + "\t" + cell.getGt());
writer.append(System.lineSeparator());
}
writer.append(System.lineSeparator());
}
}
public void readTripples() {
File nfile = new File(TRIPPLEFILE);
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(nfile));
String line = null;
reader.readLine();
while ((line = reader.readLine()) != null) {
String[] splitter = line.split(" ");
String freebaseOrig = splitter[splitter.length - 2];
// Freebase uri
freebaseOrig = freebaseOrig.replaceAll("<http://rdf.freebase.com/ns", "").replaceAll(">", "")
.replaceAll("\\.", "/");
String dbpediaUri = splitter[0];
dbpediaUri = dbpediaUri.replaceAll("<|>", "");
if (uriconversion.containsKey(freebaseOrig)) {
String uris = uriconversion.get(freebaseOrig);
uris += "," + dbpediaUri;
uriconversion.put(freebaseOrig, uris);
} else {
uriconversion.put(freebaseOrig, dbpediaUri);
}
}
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
public void readGroundtruthFile(File gt) {
BufferedReader reader = null;
this.groundtruth = new HashMap<String, String>();
try {
reader = new BufferedReader(new FileReader(gt));
String line = null;
while ((line = reader.readLine()) != null) {
String row = line.split("=")[0];
String freebaseGT = line.replaceAll(".*=", "");
freebaseGT = freebaseGT.substring(0, freebaseGT.length() - 1);
groundtruth.put(row, freebaseGT);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
class Converter extends HTMLEditorKit.ParserCallback {
private boolean isCorrectTable = false;
private boolean checkText = false;
private boolean isCorrectCell = false;
private int columnCounter = 0;
private Table table;
Map<String, String> groundtruth;
Converter(HashMap<String, String> groundtruth) {
super();
this.groundtruth = groundtruth;
this.table = new Table();
}
public void parse(Reader in) throws IOException {
ParserDelegator delegator = new ParserDelegator();
// the third parameter is TRUE to ignore charset directive
delegator.parse(in, this, Boolean.TRUE);
}
public Table getTable() {
return table;
}
public void integrateGT() {
for (Map.Entry<String, String> entry : groundtruth.entrySet()) {
String pos = entry.getKey();
String[] splitter = pos.split(",");
int columnNr = Integer.valueOf(splitter[1]);
int cellNr = Integer.valueOf(splitter[0]);
Column col = table.getColumn(columnNr);
if (col != null) {
List<Cell> cellList = col.getCellList();
if (uriconversion.containsKey(entry.getValue())) {
String wikigt = uriconversion.get(entry.getValue());
cellList.get(cellNr).setGt(wikigt);
} else {
cellList.get(cellNr).setGt("NULL");
}
}
}
}
private boolean isRelevantTable(String table) {
return table.contains("tbl");
}
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
if (t.toString().equals("table") && isRelevantTable(a.toString())) {
isCorrectTable = true;
} else if (isCorrectTable && t.toString().equals("tr")) {
} else if (isCorrectTable && t.toString().equals("td")) {
isCorrectCell = true;
} else if (isCorrectTable && t.toString().equals("a") && isCorrectCell) {
int tablecols = table.getNumberofColumns();
if (tablecols < (columnCounter + 1)) {
System.out.println("ALSO BEIM ADDEN SIND WIR");
table.addColumn("");
}
columnCounter++;
checkText = true;
}
}
public void handleEndTag(HTML.Tag t, int pos) {
if (t.toString().equals("table") && isCorrectTable) {
isCorrectTable = false;
} else if (t.toString().equals("a") && isCorrectTable && checkText && isCorrectCell) {
checkText = false;
} else if (t.toString().equals("td") && isCorrectTable) {
isCorrectCell = false;
} else if (t.toString().equals("tr") && isCorrectTable) {
System.out.println("ICH REETTTTTTEEEE");
columnCounter = 0;
}
}
public void handleText(char[] text, int pos) {
String s = new String(text);
if (checkText) {
Column c = table.getColumn(columnCounter - 1);
System.out.println(columnCounter - 1);
System.out.println(s);
c.addCell(s);
}
}
}
public static void main(String args[]) {
MusicBrainzConverter imdbConverter = new MusicBrainzConverter();
imdbConverter.readTripples();
System.out.println("Finished Tripple Reading");
imdbConverter.readFile(new File(RAWDIRECTORY));
}
}