/*******************************************************************************
 * Copyright 2013
 * TU Darmstadt, FG Sprachtechnologie
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.bigdata.io.hadoop;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.input.CountingInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * Creates a {@link LeipzigRecordReader} for Leipzig corpora.
 *
 * @author Johannes Simon
 */
public class LeipzigInputFormat
    extends FileInputFormat<Text, CrawlerRecord>
{
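    /*
     * Typical driver wiring (a sketch, not part of this class): an old-style mapred job
     * would plug this input format in roughly as follows. The job name and input path
     * are placeholders.
     *
     *   JobConf job = new JobConf();
     *   job.setJobName("leipzig-import");                           // hypothetical name
     *   job.setInputFormat(LeipzigInputFormat.class);
     *   FileInputFormat.addInputPath(job, new Path("/corpora/leipzig")); // hypothetical path
     *   // map(Text url, CrawlerRecord record, ...) then receives one record per call
     */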
    /**
     * Parses and modifies source metadata as given in Leipzig corpora.
     *
     * @author LSW
     */
    public static class SourceMetadata
    {
        private Document doc;

        public SourceMetadata()
        {
            // Initialize with valid placeholder metadata XML
            try {
                loadXml("<source><location>null</location><date>null</date><user>null</user>"
                        + "<original_encoding>null</original_encoding><language>null</language>"
                        + "<issue>null</issue></source>");
            }
            catch (SAXException e) {
                e.printStackTrace();
            }
        }

        public SourceMetadata(String xml) throws SAXException
        {
            loadXml(xml);
        }

        private void loadXml(String xml) throws SAXException
        {
            // Wrap the location in CDATA so URLs containing characters such as '&'
            // do not break the XML parser
            if (!xml.contains("<location><![CDATA[")) {
                xml = xml.replace("<location>", "<location><![CDATA[");
                xml = xml.replace("</location>", "]]></location>");
            }
            try {
                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
                DocumentBuilder db = dbf.newDocumentBuilder();
                InputSource is = new InputSource();
                is.setCharacterStream(new StringReader(xml));
                doc = db.parse(is);
            }
            catch (ParserConfigurationException e) {
                e.printStackTrace();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        public String getEntry(String name)
        {
            try {
                NodeList nodes = doc.getElementsByTagName(name);
                Element line = (Element) nodes.item(0);
                return getCharacterDataFromElement(line);
            }
            catch (NullPointerException e) {
                // Element does not exist
                return null;
            }
        }

        public void setEntry(String name, String entry)
        {
            try {
                NodeList nodes = doc.getElementsByTagName(name);
                Element item = (Element) nodes.item(0);
                Node child = item.getFirstChild();
                child.setNodeValue(entry);
            }
            catch (NullPointerException e) {
                System.out.println("DocumentMetadata: could not write to " + name + " - " + entry);
            }
        }

        private String getCharacterDataFromElement(Element e)
        {
            Node child = e.getFirstChild();
            if (child instanceof CharacterData) {
                CharacterData cd = (CharacterData) child;
                return cd.getData();
            }
            return "?";
        }

        public String getXMLString()
        {
            Transformer transformer;
            try {
                transformer = TransformerFactory.newInstance().newTransformer();
                transformer.setOutputProperty(OutputKeys.INDENT, "no");
                StreamResult result = new StreamResult(new StringWriter());
                DOMSource source = new DOMSource(doc);
                transformer.transform(source, result);
                String xmlString = result.getWriter().toString();
                xmlString = xmlString.replace(
                        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>", "");
                // Re-add the CDATA wrapper in case the transformer dropped it
                if (!xmlString.contains("<location><![CDATA[")) {
                    xmlString = xmlString.replace("<location>", "<location><![CDATA[");
                    xmlString = xmlString.replace("</location>", "]]></location>");
                }
                return xmlString;
            }
            catch (TransformerConfigurationException e) {
                e.printStackTrace();
            }
            catch (TransformerFactoryConfigurationError e) {
                e.printStackTrace();
            }
            catch (TransformerException e) {
                e.printStackTrace();
            }
            return null;
        }

        public static void main(String[] args)
        {
            String data = "<source><location>http://www.bedakafi.ch/anfragen.html</location>"
                    + "<date>2011-02-02</date><user>Treasurer</user>"
                    + "<original_encoding>utf-8</original_encoding><language>deu</language>"
                    + "<issue>encoding</issue></source>";
            SourceMetadata dm;
            try {
                dm = new SourceMetadata(data);
                System.out.println(dm.getEntry("location"));
                dm.setEntry("location", "http://localhost");
                System.out.println(dm.getEntry("issue"));
                System.out.println(dm.getEntry("not existent"));
                System.out.println(dm.getXMLString());
            }
            catch (SAXException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public RecordReader<Text, CrawlerRecord> getRecordReader(InputSplit inputSplit,
            JobConf jobConf, Reporter reporter)
        throws IOException
    {
        return new LeipzigRecordReader((FileSplit) inputSplit, jobConf, reporter);
    }
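    /*
     * Standalone read loop (a sketch for local testing; "corpus.txt" and its length are
     * placeholders -- in a real job the framework drives this via getRecordReader() above):
     *
     *   JobConf conf = new JobConf();
     *   Path file = new Path("corpus.txt");
     *   long length = file.getFileSystem(conf).getFileStatus(file).getLen();
     *   LeipzigRecordReader reader = new LeipzigRecordReader(
     *           new FileSplit(file, 0, length, (String[]) null), conf);
     *   Text key = reader.createKey();
     *   CrawlerRecord value = reader.createValue();
     *   while (reader.next(key, value)) {
     *       // key = URL of the record, value = content plus parsed metadata
     *   }
     *   reader.close();
     */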
    /**
     * Reads text corpus entries in Leipzig format.
     *
     * @author Johannes Simon
     */
    public static class LeipzigRecordReader
        implements RecordReader<Text, CrawlerRecord>
    {
        private long start;
        private long end;
        private CountingInputStream countingIs;
        private BufferedReader reader;

        private long nextRecordStart;
        private String currentRecordContent;
        private String currentRecordHeader;
        private String nextRecordHeader;
        private long posInByteStream;
        private long posInCharStream; // only reported in error diagnostics; currently never advanced

        private final String FILE_ENCODING = "UTF-8";

        private FileSplit fileSplit;

        org.apache.hadoop.mapred.Counters.Counter skippedRecordCounter = null;

        /*
         * ======================== RecordReader Logic ============================
         */

        enum ProcessingErrorCounters {
            SkippedDueToException
        }

        public LeipzigRecordReader(FileSplit split, JobConf jobConf) throws IOException
        {
            this(split, jobConf, null);
        }

        public LeipzigRecordReader(FileSplit split, JobConf jobConf, Reporter reporter)
            throws IOException
        {
            // Remember file split instance for debugging purposes
            fileSplit = split;
            start = split.getStart();
            end = start + split.getLength();
            System.out.println("Initializing input reader for input split:");
            System.out.println(split);

            if (reporter != null) {
                skippedRecordCounter = reporter
                        .getCounter(ProcessingErrorCounters.SkippedDueToException);
            }

            posInByteStream = start;
            posInCharStream = 0;

            // Open the file and seek to the start of the split
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(jobConf);
            InputStream is = fs.open(split.getPath());
            countingIs = new CountingInputStream(is);
            countingIs.skip(start);
            reader = new BufferedReader(new InputStreamReader(countingIs, FILE_ENCODING));

            // Start with the first valid record after offset "start"
            skipToNextRecord(reader);
        }

        private boolean parseMetaLine(CrawlerRecord value, String line)
        {
            if (line == null) {
                System.err.println(
                        "[LeipzigInputFormat] Warning: Skipping record because extracted meta line is null!");
                return false;
            }
            if (line.contains("\u0000")) {
                System.out.println("[parseMetaLine] Line contains null character!");
                System.out.println(line.indexOf('\u0000'));
            }
            try {
                SourceMetadata sm = new SourceMetadata(line);

                String origUrl = sm.getEntry("location");
                String url;
                if (origUrl != null && !origUrl.isEmpty() && !origUrl.equalsIgnoreCase("null")) {
                    url = origUrl;
                }
                else {
                    // Input format is not responsible for filtering incomplete records!
                    // Simply set URL to "null" (a valid string, not null!) at this point
                    url = "null";
                }
                value.setURL(url);

                // Original encoding
                String encoding = sm.getEntry("original_encoding");
                value.setOriginalEncoding(encoding);

                // Date
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                Date parsedDate = null;
                String date = sm.getEntry("date");
                if (date != null) {
                    try {
                        parsedDate = dateFormat.parse(date);
                    }
                    catch (ParseException e) {
                        System.err.println(
                                "[LeipzigInputFormat] Warning: Can't parse date: " + date);
                    }
                }
                else {
                    System.err.println("[LeipzigInputFormat] Warning: Record is missing a date.");
                }
                value.setDate(parsedDate);
            }
            catch (Exception e) {
                System.err.println(
                        "[LeipzigInputFormat] Warning: Skipping record because an exception occurred while parsing meta line "
                                + line);
                System.err.println("File split: " + fileSplit);
                System.err.println("posInCharStream: " + posInCharStream);
                System.err.println("URL: " + value.getURL());
                System.err.println("[LeipzigInputFormat] Exception details: " + e.getMessage());
                if (skippedRecordCounter != null) {
                    skippedRecordCounter.increment(1);
                }
                return false;
            }
            return true;
        }
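        /*
         * Example meta line accepted by parseMetaLine() (same shape as the sample in
         * SourceMetadata.main() above; only location, original_encoding and date are
         * extracted into the CrawlerRecord):
         *
         *   <source><location>http://www.bedakafi.ch/anfragen.html</location>
         *       <date>2011-02-02</date><user>Treasurer</user>
         *       <original_encoding>utf-8</original_encoding>
         *       <language>deu</language><issue>encoding</issue></source>
         *
         * (shown wrapped here; in the corpus file the header is a single line)
         */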
        public static final String LF = System.getProperty("line.separator");

        private boolean hasNext()
        {
            return nextRecordStart >= 0 && nextRecordStart < end;
        }

        @Override
        public boolean next(Text key, CrawlerRecord value) throws IOException
        {
            if (!hasNext()) {
                return false;
            }
            skipToNextRecord(reader);
            // Try parsing the meta line. If parsing fails, skip to the next record, and so on.
            while (!parseMetaLine(value, currentRecordHeader)) {
                if (hasNext()) {
                    skipToNextRecord(reader);
                }
                else {
                    return false;
                }
            }
            value.setContent(currentRecordContent);
            key.set(value.getURL());
            return true;
        }

        @Override
        public Text createKey()
        {
            return new Text();
        }

        @Override
        public CrawlerRecord createValue()
        {
            return new CrawlerRecord();
        }

        @Override
        public long getPos() throws IOException
        {
            // countingIs.getCount() would include bytes read ahead by the BufferedReader,
            // so the position is tracked manually instead
            return posInByteStream;
        }

        @Override
        public void close() throws IOException
        {
            countingIs.close();
        }

        @Override
        public float getProgress() throws IOException
        {
            // Guard against empty splits to avoid division by zero
            if (end == start) {
                return 1.0f;
            }
            return ((float) (getPos() - start)) / ((float) (end - start));
        }

        /*
         * ======================== Leipzig Format Logic ============================
         */

        private final String UTF8_BOM = "\uFEFF";

        /**
         * Reads from <code>input</code> until a valid record meta line (a line starting with
         * <code>&lt;source&gt;</code>) has been read. All other lines are collected as the
         * content of the current record.
         */
        private boolean skipToNextRecord(BufferedReader input) throws IOException
        {
            StringBuilder recordBuffer = new StringBuilder();
            nextRecordStart = -1; // Continue with next record in case an exception occurs
            String line;
            String recordHeaderFound = null;
            int newLineBytes = "\n".getBytes("UTF-8").length;
            boolean foundNewRecord = false;
            while ((line = input.readLine()) != null) {
                // Count bytes before stripping the BOM so posInByteStream stays accurate
                long lineSizeBytes = line.getBytes("UTF-8").length + newLineBytes;
                // BOM fix (its use is discouraged, however it does appear sometimes)
                if (line.startsWith(UTF8_BOM)) {
                    line = line.substring(1);
                }
                if (line.startsWith("<source>")) {
                    nextRecordStart = posInByteStream;
                    foundNewRecord = true;
                    recordHeaderFound = line;
                }
                else {
                    recordBuffer.append(line).append('\n');
                }
                posInByteStream += lineSizeBytes;
                if (foundNewRecord) {
                    break;
                }
            }
            currentRecordContent = recordBuffer.toString();
            currentRecordHeader = nextRecordHeader;
            nextRecordHeader = recordHeaderFound;
            return foundNewRecord;
        }
    }
}