/*******************************************************************************
* Copyright 2013
* TU Darmstadt, FG Sprachtechnologie
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.bigdata.io.hadoop;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.input.CountingInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Hadoop input format that reads Leipzig-corpus crawl files, creating a
 * {@link LeipzigRecordReader} per file split. Keys are record URLs, values
 * are the corresponding {@link CrawlerRecord}s.
 *
 * @author Johannes Simon
 *
 */
public class LeipzigInputFormat extends FileInputFormat<Text, CrawlerRecord> {
/**
 * Parses and modifies source metadata as given in Leipzig corpora.
 *
 * The metadata is a single-line XML snippet of the form
 * {@code <source><location>...</location><date>...</date>...</source>}.
 * The location element is wrapped in a CDATA section before parsing so that
 * crawled URLs containing XML-special characters (e.g. '&amp;') do not break
 * the parser; {@link #getXMLString()} restores that wrapping on output.
 *
 * @author LSW
 *
 */
public static class SourceMetadata {
    // Parsed metadata document. May remain null if parser configuration
    // fails; all accessors handle that case defensively.
    private Document doc;

    /**
     * Initializes with a valid placeholder metadata XML in which every
     * field holds the literal string "null".
     */
    public SourceMetadata() {
        // Initialize with valid placeholder meta data XML
        try {
            loadXml("<source><location>null</location><date>null</date><user>null</user><original_encoding>null</original_encoding><language>null</language><issue>null</issue></source>");
        } catch (SAXException e) {
            // Cannot normally happen: the placeholder XML is well-formed.
            e.printStackTrace();
        }
    }

    /**
     * Parses the given metadata XML snippet.
     *
     * @param xml metadata in Leipzig {@code <source>...</source>} format
     * @throws SAXException if the XML is not well-formed
     */
    public SourceMetadata(String xml) throws SAXException {
        loadXml(xml);
    }

    /**
     * Parses the XML into {@link #doc}, wrapping the location element in
     * CDATA first if it is not already wrapped.
     */
    private void loadXml(String xml) throws SAXException {
        if (!xml.contains("<location><![CDATA[")) {
            xml = xml.replace("<location>", "<location><![CDATA[");
            xml = xml.replace("</location>", "]]></location>");
        }
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            // Harden against XXE: the metadata originates from crawled,
            // untrusted input, so forbid DTDs and entity expansion.
            dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            dbf.setExpandEntityReferences(false);
            DocumentBuilder db = dbf.newDocumentBuilder();
            InputSource is = new InputSource();
            is.setCharacterStream(new StringReader(xml));
            doc = db.parse(is);
        } catch (ParserConfigurationException e) {
            // Leaves doc == null; accessors below tolerate that.
            e.printStackTrace();
        } catch (IOException e) {
            // Cannot normally happen when reading from a StringReader.
            e.printStackTrace();
        }
    }

    /**
     * Returns the text content of the first element with the given tag
     * name, or null if the document or element is missing. Returns "?" if
     * the element exists but its first child is not character data.
     *
     * @param name tag name, e.g. "location" or "date"
     */
    public String getEntry(String name) {
        if (doc == null) {
            return null;
        }
        NodeList nodes = doc.getElementsByTagName(name);
        Element line = (Element) nodes.item(0);
        if (line == null || line.getFirstChild() == null) {
            return null;
        }
        return getCharacterDataFromElement(line);
    }

    /**
     * Overwrites the text content of the first element with the given tag
     * name. Logs a message and leaves the document unchanged if the element
     * (or its text node) does not exist.
     *
     * @param name  tag name to modify
     * @param entry new text value
     */
    public void setEntry(String name, String entry) {
        NodeList nodes = doc != null ? doc.getElementsByTagName(name) : null;
        Element item = nodes != null ? (Element) nodes.item(0) : null;
        Node child = item != null ? item.getFirstChild() : null;
        if (child == null) {
            System.out.println("DocumentMetadata: could not write to " + name + " - " + entry);
            return;
        }
        child.setNodeValue(entry);
    }

    // Extracts the text (or CDATA) content of an element's first child,
    // or "?" if that child is not character data.
    private String getCharacterDataFromElement(Element e) {
        Node child = e.getFirstChild();
        if (child instanceof CharacterData) {
            CharacterData cd = (CharacterData) child;
            return cd.getData();
        }
        return "?";
    }

    /**
     * Serializes the (possibly modified) metadata back to a single XML
     * string, without an XML declaration and with the location element
     * wrapped in CDATA.
     *
     * @return the XML string, or null if serialization failed
     */
    public String getXMLString() {
        Transformer transformer;
        try {
            transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.INDENT, "no");
            StreamResult result = new StreamResult(new StringWriter());
            DOMSource source = new DOMSource(doc);
            transformer.transform(source, result);
            String xmlString = result.getWriter().toString();
            // Strip the declaration the transformer prepends; records are
            // stored as bare single-line snippets.
            xmlString = xmlString.replace("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>", "");
            if (!xmlString.contains("<location><![CDATA[")) {
                xmlString = xmlString.replace("<location>", "<location><![CDATA[");
                xmlString = xmlString.replace("</location>", "]]></location>");
            }
            return xmlString;
        } catch (TransformerConfigurationException e) {
            e.printStackTrace();
        } catch (TransformerFactoryConfigurationError e) {
            e.printStackTrace();
        } catch (TransformerException e) {
            e.printStackTrace();
        }
        return null;
    }

    /** Small manual smoke test for parsing, modification and round-tripping. */
    public static void main(String[] args) {
        String data = "<source><location>http://www.bedakafi.ch/anfragen.html</location><date>2011-02-02</date><user>Treasurer</user><original_encoding>utf-8</original_encoding><language>deu</language><issue>encoding</issue></source>";
        SourceMetadata dm;
        try {
            dm = new SourceMetadata(data);
            System.out.println(dm.getEntry("location"));
            dm.setEntry("location", "http://localhost");
            System.out.println(dm.getEntry("issue"));
            System.out.println(dm.getEntry("not existent"));
            System.out.println(dm.getXMLString());
        } catch (SAXException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Creates a reader that iterates the Leipzig-format records of one split.
 *
 * @param inputSplit the file split to read; must be a {@link FileSplit}
 * @param jobConf    job configuration used to open the underlying file system
 * @param reporter   progress reporter; also supplies the skipped-record counter
 * @return a new {@link LeipzigRecordReader} positioned at the split start
 * @throws IOException if the split's file cannot be opened
 */
@Override
public RecordReader<Text, CrawlerRecord> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) inputSplit;
    return new LeipzigRecordReader(fileSplit, jobConf, reporter);
}
/**
 * Reads text corpus entries in Leipzig format: each record is a
 * {@code <source>...</source>} metadata header line followed by the record's
 * plain-text content, running up to the next header line.
 *
 * @author Johannes Simon
 *
 */
public static class LeipzigRecordReader implements RecordReader<Text, CrawlerRecord> {
    // Byte offset in the file at which this split starts.
    private long start;
    // Byte offset (exclusive) at which this split ends.
    private long end;
    // Wraps the raw input stream; kept so close() can release it.
    private CountingInputStream countingIs;
    private BufferedReader reader;
    // Byte offset of the next record's header line, or -1 if none found.
    private long nextRecordStart;
    private String currentRecordContent;
    private String currentRecordHeader;
    private String nextRecordHeader;
    // Read position in bytes, tracked manually in skipToNextRecord()
    // because BufferedReader hides how many bytes each line consumed.
    private long posInByteStream;
    // NOTE(review): never advanced after initialization; only printed in
    // error diagnostics — confirm whether it was meant to be maintained.
    private long posInCharStream;
    private final String FILE_ENCODING = "UTF-8";
    // Remembered only for diagnostics in error messages.
    private FileSplit fileSplit;
    org.apache.hadoop.mapred.Counters.Counter skippedRecordCounter = null;
    /*
     * ======================== RecordReader Logic ============================
     */
    enum ProcessingErrorCounters {
        SkippedDueToException
    }

    public LeipzigRecordReader(FileSplit split, JobConf jobConf) throws IOException {
        this(split, jobConf, null);
    }

    /**
     * Opens the split's file, seeks to the split start and positions the
     * reader on the first record header at or after that offset.
     *
     * @param split    file split to read
     * @param jobConf  configuration used to resolve the file system
     * @param reporter optional; supplies the skipped-record counter
     * @throws IOException if the file cannot be opened or read
     */
    public LeipzigRecordReader(FileSplit split, JobConf jobConf, Reporter reporter) throws IOException {
        // Remember file split instance for debugging purposes
        fileSplit = split;
        start = split.getStart();
        end = start + split.getLength();
        System.out.println("Initializing input reader for input split:");
        System.out.println(split);
        if (reporter != null) {
            skippedRecordCounter = reporter.getCounter(ProcessingErrorCounters.SkippedDueToException);
        }
        posInByteStream = start;
        posInCharStream = 0;
        // Open the file and seek to the start of the split
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(jobConf);
        InputStream is = fs.open(split.getPath());
        countingIs = new CountingInputStream(is);
        countingIs.skip(start);
        reader = new BufferedReader(new InputStreamReader(countingIs, FILE_ENCODING));
        // Start with the first valid record after offset "start"
        skipToNextRecord(reader);
    }

    /**
     * Fills <code>value</code> with URL, original encoding and date parsed
     * from a record's metadata header line.
     *
     * @return true if the line was parsed; false if the record must be skipped
     */
    private boolean parseMetaLine(CrawlerRecord value, String line) {
        if (line == null) {
            System.err.println("[LeipzigInputFormat] Warning: Skipping record because extracted meta line is null!");
            return false;
        }
        if (line.contains("\u0000")) {
            System.out.println("[parseMetaLine] Line contains null character!");
            System.out.println(line.indexOf('\u0000'));
        }
        try {
            SourceMetadata sm = new SourceMetadata(line);
            String origUrl = sm.getEntry("location");
            String url;
            // FIX: getEntry() returns null for a missing <location> element;
            // previously that tripped an NPE here and the record was skipped,
            // contradicting the comment below. Treat null like "null".
            if (origUrl != null && !origUrl.isEmpty() && !origUrl.equalsIgnoreCase("null")) {
                url = origUrl;
            } else {
                // Input format is not responsible for filtering incomplete records!
                // Simply set URL to "null" (a valid string, not null!) at this point
                url = "null";
            }
            value.setURL(url);
            // Original encoding
            String encoding = sm.getEntry("original_encoding");
            value.setOriginalEncoding(encoding);
            // Date
            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
            Date parsedDate = null;
            String date = sm.getEntry("date");
            if (date != null) {
                try {
                    parsedDate = dateFormat.parse(date);
                } catch (ParseException e) {
                    System.err.println("[LeipzigInputFormat] Warning: Can't parse date: " + date);
                }
            } else {
                System.err.println("[LeipzigInputFormat] Warning: Record is missing a date.");
            }
            value.setDate(parsedDate);
        } catch (Exception e) {
            System.err.println("[LeipzigInputFormat] Warning: Skipping record because an exception occured while parsing meta line " + line);
            System.err.println("File split: " + fileSplit);
            System.err.println("posInCharStream: " + posInCharStream);
            System.err.println("URL: " + value.getURL());
            System.err.println("[LeipzigInputFormat] Exception details: " + e.getMessage());
            if (skippedRecordCounter != null)
                skippedRecordCounter.increment(1);
            return false;
        }
        return true;
    }

    public static final String LF = System.getProperty("line.separator");

    // A next record exists if a header line was found and it starts before
    // the end of this split (records straddling the boundary belong to the
    // split that contains their header).
    private boolean hasNext() {
        return nextRecordStart >= 0 && nextRecordStart < end;
    }

    /**
     * Advances to the next record, parsing its header into
     * <code>value</code> and setting <code>key</code> to the record URL.
     * Records with unparseable headers are skipped.
     *
     * @return true if a record was produced, false at end of split
     */
    @Override
    public boolean next(Text key, CrawlerRecord value) throws IOException {
        if (!hasNext())
            return false;
        skipToNextRecord(reader);
        // Try parsing meta line. If parsing failed, skip to next record, and so on.
        while (!parseMetaLine(value, currentRecordHeader)) {
            if (hasNext())
                skipToNextRecord(reader);
            else
                return false;
        }
        value.setContent(currentRecordContent);
        key.set(value.getURL());
        return true;
    }

    @Override
    public Text createKey() {
        return new Text();
    }

    @Override
    public CrawlerRecord createValue() {
        return new CrawlerRecord();
    }

    /** Returns the manually tracked byte position within the file. */
    @Override
    public long getPos() throws IOException {
        return posInByteStream;
    }

    @Override
    public void close() throws IOException {
        countingIs.close();
    }

    /** Fraction of this split consumed so far, in [0, 1]. */
    @Override
    public float getProgress() throws IOException {
        // FIX: guard against zero-length splits, which previously yielded
        // NaN through float division by zero.
        if (end == start)
            return 1.0f;
        return ((float) (getPos() - start)) / ((float) (end - start));
    }
    /*
     * ======================== ARC Logic ============================
     */
    private final String UTF8_BOM = "\uFEFF";

    /**
     * Reads from <code>input</code> until a valid record meta line was read.
     * Everything else is accumulated as the current record's content.
     * Shifts the previously found header into currentRecordHeader and stores
     * the newly found one (if any) in nextRecordHeader.
     *
     * @return true if a new record header line was found
     */
    private boolean skipToNextRecord(BufferedReader input) throws IOException {
        StringBuilder recordBuffer = new StringBuilder();
        nextRecordStart = -1;
        // Continue with next record in case exception occurs
        String line;
        String recordHeaderFound = null;
        final int newLineBytes = "\n".getBytes(FILE_ENCODING).length;
        boolean foundNewRecord = false;
        while ((line = reader.readLine()) != null) {
            // BOM fix (its use is discouraged, however it does appear sometimes)
            if (line.startsWith(UTF8_BOM))
                line = line.substring(1);
            if (line.startsWith("<source>")) {
                nextRecordStart = posInByteStream;
                foundNewRecord = true;
                recordHeaderFound = line;
            } else {
                recordBuffer.append(line).append('\n');
            }
            // NOTE(review): readLine() also consumes "\r\n" terminators but
            // only one newline byte is accounted for here, so positions can
            // drift on CRLF input — confirm corpus files are LF-terminated.
            long lineSizeBytes = line.getBytes(FILE_ENCODING).length + newLineBytes;
            posInByteStream += lineSizeBytes;
            if (foundNewRecord)
                break;
        }
        currentRecordContent = recordBuffer.toString();
        currentRecordHeader = nextRecordHeader;
        nextRecordHeader = recordHeaderFound;
        return foundNewRecord;
    }
}
}