package doc2vec.corpuscreation;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import doser.tools.indexcreation.WikiPediaUriConverter;
/**
 * Command-line tool that writes a text corpus with one line per entity.
 *
 * <p>Processing happens in three steps (see {@link #action()}):
 * <ol>
 * <li>the "Mainlink" field of every document in the Lucene index at
 * {@link #INDEX} is collected as the set of relevant entities;</li>
 * <li>every file in {@link #WIKIPEDIAPAGESDIR} whose name maps to a relevant
 * entity is parsed and its character data is kept as the entity's wiki
 * text;</li>
 * <li>the tab-separated context file is merged with the wiki texts and written
 * to the output file.</li>
 * </ol>
 *
 * <p>Usage: {@code CreateD2VCorpus_Wikipedia_WikiSFContext <contextFile> <outputFile>}
 */
public class CreateD2VCorpus_Wikipedia_WikiSFContext {

    /** Path of the Lucene index that is scanned for "Mainlink" values. */
    public static final String INDEX = "/home/zwicklbauer/NewIndexTryout";

    /** Directory whose files are read as Wikipedia pages. */
    public static final String WIKIPEDIAPAGESDIR = "/mnt/storage/zwicklbauer/WikiParse/temp/plain_reduced/";

    /** Runs of '.', ',', '!', '?' and blanks; compiled once because format() is called per line. */
    private static final Pattern SEPARATOR_RUN = Pattern.compile("[\\.\\,\\!\\? ]+");

    /** Input file path, taken from the first command-line argument. */
    private static String contextFile;

    /** Output file path, taken from the second command-line argument. */
    private static String outputFilePath;

    /** Entities found in the index; entries are removed once written from the context file. */
    private final HashSet<String> relevantEntities;

    /** Maps an entity to the character data extracted from its Wikipedia page. */
    private final HashMap<String, String> plainFiles;

    /**
     * Program entry point.
     *
     * @param args {@code args[0]} is the context file, {@code args[1]} the
     *             output file
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            // Previously this failed with a bare ArrayIndexOutOfBoundsException.
            System.err.println("Usage: CreateD2VCorpus_Wikipedia_WikiSFContext <contextFile> <outputFile>");
            System.exit(1);
        }
        contextFile = args[0];
        outputFilePath = args[1];
        CreateD2VCorpus_Wikipedia_WikiSFContext creation = new CreateD2VCorpus_Wikipedia_WikiSFContext();
        creation.action();
    }

    /** Creates an instance with no entities and no page texts loaded. */
    public CreateD2VCorpus_Wikipedia_WikiSFContext() {
        this.plainFiles = new HashMap<String, String>();
        this.relevantEntities = new HashSet<String>();
    }

    /**
     * Lower-cases the text and collapses every run of '.', ',', '!', '?' and
     * blanks into a single blank.
     *
     * @param s text to normalize, must not be null
     * @return the normalized text
     */
    private String format(String s) {
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        return SEPARATOR_RUN.matcher(s.toLowerCase(Locale.ROOT)).replaceAll(" ");
    }

    /** Runs the three processing steps in order, announcing each on stdout. */
    public void action() {
        System.out.println("ExtractRelevantEntities");
        extractRelevantEntities();
        System.out.println("ReadWikiPages");
        readWikipediaPages();
        System.out.println("WriteOutput");
        createOutputFile();
    }

    /**
     * Writes one line ("entity formattedWikiText") for every entity still in the
     * relevant set that has a non-empty wiki text. Entities without wiki text
     * are skipped.
     *
     * <p>This is an ordinary method; it only shares its name with
     * {@code Object.finalize()} and is not invoked by the garbage collector.
     *
     * @param writer destination, must be open
     */
    public void finalize(PrintWriter writer) {
        for (String entity : relevantEntities) {
            String wikiText = getWikiText(entity);
            if (wikiText.isEmpty()) {
                continue;
            }
            writer.println(entity + " " + format(wikiText));
        }
    }

    /**
     * Writes the output file.
     *
     * <p>Each line of the context file is split on tabs; the first field is the
     * entity and the second its context text. For every such line
     * "entity formattedContext formattedWikiText" is written and the entity is
     * removed from the relevant set. Afterwards the remaining relevant entities
     * are written via {@link #finalize(PrintWriter)}. I/O failures are reported
     * on stderr and end the step.
     */
    public void createOutputFile() {
        PrintWriter writer = null;
        BufferedReader reader = null;
        try {
            writer = new PrintWriter(new File(outputFilePath));
            reader = new BufferedReader(new FileReader(new File(contextFile)));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\\t");
                if (fields.length < 2) {
                    // A line without a tab used to abort the whole run with an
                    // ArrayIndexOutOfBoundsException; skip it and keep going.
                    System.err.println("Skipping malformed context line: " + line);
                    continue;
                }
                String entity = fields[0];
                writer.println(entity + " " + format(fields[1]) + " " + format(getWikiText(entity)));
                this.relevantEntities.remove(entity);
            }
            // Must run while the writer is still open; it is called exactly once.
            this.finalize(writer);
            if (writer.checkError()) {
                // PrintWriter never throws on write failures, so ask explicitly.
                System.err.println("Errors occurred while writing " + outputFilePath);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(reader);
            closeAndReport(writer);
        }
    }

    /**
     * Looks up the wiki text stored for an entity.
     *
     * @param entity entity key
     * @return the stored text, or the empty string if none is stored
     */
    private String getWikiText(String entity) {
        String text = this.plainFiles.get(entity);
        return text == null ? "" : text;
    }

    /**
     * Scans {@link #WIKIPEDIAPAGESDIR} and stores the character data of every
     * file whose name maps to a relevant entity. Progress is printed every
     * 10000 files, followed by a summary line with the number of relevant
     * entities and the number of files that matched one.
     */
    public void readWikipediaPages() {
        File[] files = new File(WIKIPEDIAPAGESDIR).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println("Cannot list Wikipedia pages directory: " + WIKIPEDIAPAGESDIR);
            return;
        }
        int matching = 0;
        for (int i = 0; i < files.length; i++) {
            // Literal replace: the old replaceAll(".html", "") treated '.' as a
            // regex wildcard and also stripped e.g. "Dhtml" from a file name.
            String encodedName = files[i].getName().replace(".html", "").replace("'", "%");
            String finalLink = WikiPediaUriConverter
                    .createConformDBpediaUrifromEncodedString(encodedName);
            if (relevantEntities.contains(finalLink)) {
                matching++;
                String text = extractPlainText(files[i]);
                if (text != null) {
                    this.plainFiles.put(finalLink, text);
                }
            }
            if (i % 10000 == 0) {
                System.out.println(i);
            }
        }
        System.out.println("Overall: " + relevantEntities.size() + " Matching: " + matching);
    }

    /**
     * Parses a page file as XML and returns its concatenated character data.
     *
     * @param page file to parse
     * @return the character data, or null if the file could not be read or
     *         parsed (the failure is reported on stderr)
     */
    private String extractPlainText(File page) {
        String markup = readFile(page);
        if (markup == null) {
            return null;
        }
        try {
            XMLReader xmlReader = XMLReaderFactory.createXMLReader();
            PlainTextHandler handler = new PlainTextHandler();
            xmlReader.setContentHandler(handler);
            xmlReader.parse(new InputSource(new StringReader(markup)));
            return handler.getDocumentText();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Reads a file into a single string.
     *
     * @param file file to read
     * @return the file's lines joined without any separator, or null if reading
     *         failed (the failure is reported on stderr)
     */
    private String readFile(File file) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Line breaks are dropped, as before, so each output record
                // stays on one line.
                // NOTE(review): text on adjacent lines is joined without a
                // blank - confirm this is intended.
                content.append(line);
            }
            return content.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeAndReport(reader);
        }
    }

    /**
     * Collects the "Mainlink" field of every document in the Lucene index at
     * {@link #INDEX} into the relevant-entity set. Documents without that field
     * are ignored. I/O failures are reported on stderr.
     */
    public void extractRelevantEntities() {
        Directory indexDirectory = null;
        IndexReader indexReader = null;
        try {
            indexDirectory = FSDirectory.open(new File(INDEX));
            indexReader = DirectoryReader.open(indexDirectory);
            int maxDoc = indexReader.maxDoc();
            for (int docId = 0; docId < maxDoc; ++docId) {
                Document doc = indexReader.document(docId);
                String entity = doc.get("Mainlink");
                if (entity != null) {
                    this.relevantEntities.add(entity);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the reader before the directory it was opened on.
            closeAndReport(indexReader);
            closeAndReport(indexDirectory);
        }
    }

    /**
     * Closes a resource if it is non-null, printing any close failure instead of
     * propagating it.
     *
     * @param resource resource to close, may be null
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * SAX content handler that accumulates all character data of a document and
     * ignores every other event.
     */
    class PlainTextHandler implements ContentHandler {

        /** Character data seen so far. */
        private final StringBuilder documentText;

        /** Creates a handler with no text collected. */
        public PlainTextHandler() {
            this.documentText = new StringBuilder();
        }

        /** Appends the reported character range to the collected text. */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            this.documentText.append(ch, start, length);
        }

        @Override
        public void endDocument() throws SAXException {
            // ignored
        }

        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            // ignored
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
            // ignored
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length)
                throws SAXException {
            // ignored
        }

        @Override
        public void processingInstruction(String target, String data)
                throws SAXException {
            // ignored
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            // ignored
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
            // ignored
        }

        @Override
        public void startDocument() throws SAXException {
            // ignored
        }

        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes atts) throws SAXException {
            // ignored
        }

        @Override
        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
            // ignored
        }

        /**
         * @return all character data collected so far
         */
        public String getDocumentText() {
            return documentText.toString();
        }
    }
}