package com.formulasearchengine.mathosphere.mlp.text;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
 * Lookup table from (lower-cased) page titles to Wikidata item identifiers such as {@code Q42}.
 *
 * <p>The table is loaded once from a two-column CSV file (title, item) and is immutable
 * afterwards. It can be written back to disk in the same two-column layout with
 * {@link #writeFile(String)}.
 *
 * <p>NOTE(review): the file is read and written with the platform default charset, exactly as
 * before; confirm whether the data files are expected to be UTF-8 on every platform.
 */
public class WikidataLinkMap implements Serializable {
  private static final Logger LOGGER = LoggerFactory.getLogger(WikidataLinkMap.class);

  /** Immutable title-to-item mapping built by {@link #buildMap(String, boolean)}. */
  private final Map<String, String> map;

  /**
   * Loads the mapping from a CSV file whose titles are expected to be unique.
   *
   * @param fn path of the CSV file (column 0: title, column 1: item)
   * @throws IllegalArgumentException if the file contains the same title twice
   */
  public WikidataLinkMap(String fn) {
    this(fn, true);
  }

  /**
   * Loads the mapping from a CSV file.
   *
   * @param fn path of the CSV file (column 0: title, column 1: item)
   * @param unique {@code true} if every title occurs at most once in the file; {@code false} to
   *     tolerate duplicates and keep the item with the smallest numeric id for each title
   * @throws IllegalArgumentException if {@code unique} is {@code true} and a title is repeated
   */
  public WikidataLinkMap(String fn, boolean unique) {
    map = buildMap(fn, unique);
  }

  /**
   * Reads the CSV file and builds the immutable lookup table.
   *
   * <p>An {@link IOException} while opening or reading the file is logged and not rethrown; the
   * entries collected up to that point (possibly none) are returned.
   *
   * @param fn path of the CSV file
   * @param unique whether titles are unique in the file, see
   *     {@link #WikidataLinkMap(String, boolean)}
   * @return the immutable title-to-item map
   */
  private static Map<String, String> buildMap(String fn, boolean unique) {
    // Only used in non-unique mode, where later records may replace earlier ones.
    Map<String, String> keys = new HashMap<>();
    ImmutableMap.Builder<String, String> title2Data = ImmutableMap.builder();
    // try-with-resources guarantees the file handle is released, also on parse errors.
    try (FileReader in = new FileReader(fn)) {
      Iterable<CSVRecord> records = CSVFormat.RFC4180.parse(in);
      for (CSVRecord record : records) {
        String title = record.get(0);
        String item = record.get(1);
        if (unique) {
          // Duplicate titles are rejected by ImmutableMap.Builder when build() is called.
          title2Data.put(title, item);
          continue;
        }
        // Duplicate title: keep the already stored item if the new one has a larger id.
        if (keys.containsKey(title) && itemNumber(item) > itemNumber(keys.get(title))) {
          continue;
        }
        keys.put(title, item);
      }
    } catch (IOException e) {
      LOGGER.error("title2Data-problem", e);
    }
    if (!unique) {
      title2Data.putAll(keys);
    }
    return title2Data.build();
  }

  /**
   * Extracts the numeric part of an item identifier, e.g. {@code 42} from {@code "Q42"}.
   *
   * @param item item identifier
   * @return the numeric id
   * @throws NumberFormatException if nothing parseable as an int remains after stripping the
   *     {@code Q} prefix
   */
  private static int itemNumber(String item) {
    return Integer.parseInt(item.replaceAll("Q(\\d+)", "$1"));
  }

  /**
   * Resolves a title or a wiki link to its Wikidata item.
   *
   * <p>Wiki link markup {@code [[target|label]]} is reduced to {@code target}, then the text is
   * trimmed and lower-cased before the lookup. If that fails, a second lookup is attempted with
   * possessive {@code 's} suffixes and parenthesised parts removed.
   *
   * @param in title or wiki link; may be {@code null}
   * @return the item identifier, or {@code null} if {@code in} is {@code null} or unknown
   */
  public String title2Data(String in) {
    if (in == null) {
      return null;
    }
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    String title = in.replaceAll("\\[\\[([^\\|]+)\\|?(.*?)\\]\\]", "$1")
        .trim()
        .toLowerCase(java.util.Locale.ROOT);
    String item = map.get(title);
    if (item != null) {
      return item;
    }
    // some heuristics to improve mapping
    return map.get(title.replaceAll("('s|\\(.*?\\))", "").trim());
  }

  /**
   * Writes the list in memory to a file.
   *
   * @param fn Filename of the output file
   * @return boolean if the writing process was successful
   */
  public boolean writeFile(String fn) {
    // try-with-resources closes the stream even if writing fails half-way.
    try (OutputStream out = new FileOutputStream(fn)) {
      writeObject(out);
    } catch (Exception e) {
      LOGGER.error("Could not write link map to {}", fn, e);
      return false;
    }
    return true;
  }

  /**
   * Prints every entry of the map as one CSV record (title, item) to the given stream and
   * flushes it. The stream is not closed; that is left to the caller.
   *
   * @param out target stream
   * @throws IOException if writing to the stream fails
   */
  private void writeObject(OutputStream out) throws IOException {
    OutputStreamWriter writer = new OutputStreamWriter(out);
    CSVPrinter printer = CSVFormat.DEFAULT.withRecordSeparator("\n").print(writer);
    for (Map.Entry<String, String> entry : map.entrySet()) {
      printer.printRecord(entry.getKey(), entry.getValue());
    }
    printer.flush();
    writer.flush();
    out.flush();
  }
}