/* * * Copyright 2013 LinkedIn Corp. All rights reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ package com.linkedin.databus2.ggParser.staxparser.validator; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; import java.io.RandomAccessFile; import java.io.SequenceInputStream; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import javax.xml.stream.FactoryConfigurationError; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.apache.log4j.Logger; import com.linkedin.databus.core.ConcurrentAppendableCompositeFileInputStream; import com.linkedin.databus2.ggParser.XmlStateMachine.ColumnState; import com.linkedin.databus2.ggParser.XmlStateMachine.DbUpdateState; import com.linkedin.databus2.ggParser.XmlStateMachine.TokenState; /** * A parser that reads XMLFORMAT GoldenGate trail file and validates it. */ public class XmlFormatTrailParser implements Runnable { private static final long POSITION_REPORT_GAP = 10L * 1024L * 1024; private static final long POSITION_REPORT_TIME_MS = 30 * 1000; private static final int ERROR_CONTEXT_LEN = 100; private static final String TOKEN_XID = "TK-XID"; public static final String DTD = "<!DOCTYPE root [ \n" + "<!ELEMENT root (transaction)*> \n" + "<!ELEMENT transaction (dbupdate)*> \n" + "<!ATTLIST transaction \n" + " timestamp CDATA #REQUIRED \n" + ">\n" + "<!ELEMENT dbupdate (columns, tokens)> \n" + "<!ATTLIST dbupdate \n" + " table CDATA #REQUIRED \n" + " type (insert|delete|update) #REQUIRED \n" + ">\n" + "<!ELEMENT columns (column)+>\n" + "<!ELEMENT column (#PCDATA)>\n" + "<!ATTLIST column \n" + " name CDATA #REQUIRED \n" + " key (true) #IMPLIED \n" + " status CDATA #IMPLIED \n" + ">\n" + "<!ELEMENT tokens (token)+>\n" + "<!ELEMENT token (#PCDATA)>\n" + "<!ATTLIST token \n" + " name CDATA #REQUIRED \n" + ">\n" + "]>\n"; private static enum DbupdateType { INSERT, DELETE, UPDATE }; private final Logger _log; /** The STAX XML reader factory*/ private final XMLInputFactory _xmlInputFactory; /** The STAX XML reader */ private final XMLStreamReader _xmlStreamReader; /** The input stream that coaslesces all trail files into a single stream*/ private final ConcurrentAppendableCompositeFileInputStream _inputStream; /** A hack to add missing XML root element in the trails */ private final InputStream _realInputStream; /** A flag to shutdown the parsing */ private final AtomicBoolean _shutdownRequested = new AtomicBoolean(false); /** The last observed error */ private Throwable _lastError = null; /** The currently processed trail file name */ private String _lastFileName = null; /** The offset of the last block of data read by the XML reader because it does * internal buffering */ private long _lastPosition = 0; /** The state of the parser */ private String _currentTableName; /** A map from dbname.tablename to a set of column names we've seen for that table */ private Map<String, Set<String>> _tableColumns = new HashMap<String, Set<String>>(); /** The set of columns we've seen for the current table */ private Set<String> _curTableColumns = new HashSet<String>(); /** The set of GG tokens we've seen for the current table */ private Set<String> _curTableTokens = new HashSet<String>(); /** The type of the current <dbupdate> */ private DbupdateType _dbupdateType; private final boolean _continueOnError; private final PrintStream _errOut; private long _errorCount = 0; public XmlFormatTrailParser(ConcurrentAppendableCompositeFileInputStream inputStream, boolean validating, Logger log, String errorLogFile) throws XMLStreamException, IOException { super(); _log = (null == log) ? Logger.getLogger(XmlFormatTrailParser.class) : log; _inputStream = inputStream; _realInputStream = wrapStreamWithXmlTags(_inputStream); _xmlInputFactory = createXmlInputFactory(validating); _xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); _xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.TRUE); _xmlStreamReader = _xmlInputFactory.createXMLStreamReader(_realInputStream); _continueOnError = null != errorLogFile; if (_continueOnError) { _errOut = new PrintStream(errorLogFile); } else { _errOut = null; } } private XMLInputFactory createXmlInputFactory(boolean validating) throws FactoryConfigurationError { XMLInputFactory result = null; Throwable createError = null; try { @SuppressWarnings("unchecked") Class<XMLInputFactory> woodstoxFactory = (Class<XMLInputFactory>)Class.forName("com.ctc.wstx.stax.WstxInputFactory"); result = woodstoxFactory.newInstance(); if (validating) { _log.info("found woodstox library: DTD validation will be enabled"); result.setProperty(XMLInputFactory.IS_VALIDATING, validating); } } catch (ClassNotFoundException e) { createError = e; } catch (InstantiationException e) { createError = e; } catch (IllegalAccessException e) { createError = e; } catch (RuntimeException e) { createError = e; } if (null != createError) { _log.info("unable to find woodstox library, defaulting to Java: " + createError); if (validating) { _log.warn("default implementation does not support DTD validation"); } result = XMLInputFactory.newInstance(); } return result; } /** Make the trail files input look like real XML */ private InputStream wrapStreamWithXmlTags(InputStream compositeInputStream) { String xmlStart = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + DTD + "\n<root>"; String xmlEnd = "</root>"; _log.info("The xml start tag used is:" + xmlStart); List<InputStream> xmlTagsList = Arrays.asList(new InputStream[] { new ByteArrayInputStream(xmlStart.getBytes(Charset.forName("ISO-8859-1"))), compositeInputStream, new ByteArrayInputStream(xmlEnd.getBytes(Charset.forName("ISO-8859-1"))), }); Enumeration<InputStream> streams = Collections.enumeration(xmlTagsList); SequenceInputStream seqStream = new SequenceInputStream(streams); return seqStream; } public void shutdownAsyncronously() { _shutdownRequested.set(true); } /** * @see java.lang.Runnable#run() */ @Override public void run() { long lastReportTs = 0; long lastReportedPosition = _lastPosition; long savePos = _lastPosition; try { while (! _shutdownRequested.get() && _xmlStreamReader.hasNext()) { int eventType = _xmlStreamReader.next(); switch (eventType) { case XMLStreamConstants.START_ELEMENT: processStartElement(); break; case XMLStreamConstants.END_ELEMENT: processEndElement(); break; // case XMLStreamConstants.ATTRIBUTE: processAttribute(); break; default: break; //do nothing -- just log progress } File curFile = _inputStream.getCurrentFile(); if (null != curFile && ! _inputStream.isClosed()) { String curFileName = curFile.getName(); long currentTs = System.currentTimeMillis(); long curPos = _inputStream.getCurrentPosition(); if (curPos != savePos) { //the XML reader seems to do internal buffering which makes it harder to //guess the position where a problem happened. //At any point the parser is processing between the bytes [_lastPosition, curPos). _lastPosition = savePos; savePos = curPos; } if (! curFileName.equals(_lastFileName) || (curPos - lastReportedPosition) >= POSITION_REPORT_GAP || (currentTs - lastReportTs) >= POSITION_REPORT_TIME_MS) { lastReportedPosition = curPos; lastReportTs = currentTs; _log.info("file: " + curFileName + "; pos: " + curPos); logTablesSeen(); } _lastFileName = curFileName; } } } catch (XMLStreamException e) { _log.error("xml stream error: " + e, e); _lastError = e; } catch (RuntimeException e) { _log.error("runtime error: " + e, e); _lastError = e; } } private void processError(String msg, Throwable e) { ++_errorCount; if (_continueOnError) { _log.error("PARSE ERROR: " + msg); _errOut.println("========================================="); _errOut.println(String.format("PARSE ERROR %s", msg) ); try { printErrorContext(_errOut); } catch (IOException e1) { _errOut.println("I/O error: " + e1); } _errOut.println("========================================="); } else { if (null != e) { throw new RuntimeException(msg); } else { throw new RuntimeException(msg, e); } } } /** In case an error print out the trail file context. Because of the * XML reader buffering, we can only get the block (8K). It has to be * visually inspected */ protected void printErrorContext(PrintStream errOut) throws IOException { final File file = _inputStream.getCurrentFile(); final long position = _inputStream.getCurrentPosition(); File lastFile = new File(file.getParentFile(), getLastFileName()); errOut.println("error between " + lastFile + " @ " + getLastPosition() + " and " + _inputStream.getCurrentFile() + " @ " + _inputStream.getCurrentPosition()); RandomAccessFile f = new RandomAccessFile(file, "r"); try { long startPos = lastFile.equals(file) ? getLastPosition() : 0; long endPos = position + ERROR_CONTEXT_LEN; int contextSize = (int)(endPos - startPos); byte[] context = new byte[contextSize]; f.seek(startPos); if (f.read(context, 0, contextSize) > 0) { errOut.println("context: " + new String(context, "ISO-8859-1")); } else { errOut.println("unable to read XML error context"); } } finally { f.close(); } } /** * Logs the currently discovered tables */ private void logTablesSeen() { if (_log.isInfoEnabled()) { _log.info("Discovered tables:" + _tableColumns); } } /** * Processes a GG token for the current table update */ private void processToken(String tokenName) { _curTableTokens.add(tokenName); if (_log.isDebugEnabled()) { _log.debug("added token " + tokenName + " for table " + _currentTableName); } } /** * Processes a new column for the current table update */ private void processColumn(String columnName, boolean isKey) { if (isKey) { _curTableColumns.add("*" + columnName + "*"); } else { _curTableColumns.add(columnName); } if (_log.isDebugEnabled()) { _log.debug("added column " + columnName + " for table " + _currentTableName); } } /** * */ private void processEndElement() { final String elemName = _xmlStreamReader.getLocalName(); if (ColumnState.COLUMNSTATE.equals(elemName)) { endColumnElement(); } else if (TokenState.TOKENSTATE.equals(elemName)) { endTokenElement(); } else if (DbUpdateState.DBUPDATE.equals(elemName)) { endDbupdateElement(); } } /** * End processing a <dbupdate> element */ private void endDbupdateElement() { validateTokens(); validateColumns(); _dbupdateType = null; } /** * Validate the columns we've seen for a table update: * (1) GG-specific columns are present * * <p>The method also keeps track of all columns we've seen for a table */ private void validateColumns() { if (_log.isDebugEnabled()) { _log.debug("validating columns " + _curTableColumns + " for table " + _currentTableName); } /** GG_STATUS and GG_MODI_TS may or may not be present for delete operations */ if (DbupdateType.DELETE != _dbupdateType) { if (! _curTableColumns.contains("GG_STATUS")) { processError("missing GG_STATUS column for table " + _currentTableName + " update type:" + _dbupdateType, null); } if (! _curTableColumns.contains("GG_MODI_TS")) { processError("missing column GG_MODI_TS for table " + _currentTableName + " update type:" + _dbupdateType, null); } } if (_tableColumns.containsKey(_currentTableName)) { Set<String> cols = _tableColumns.get(_currentTableName); cols.addAll(_curTableColumns); } else { _tableColumns.put(_currentTableName, new HashSet<String>(_curTableColumns)); } } /** * Validates the token for the table update: * (1) make sure that the transaction id is there * (2) make sure that the SCN is there */ private void validateTokens() { if (_log.isDebugEnabled()) { _log.debug("validating tokens " + _curTableTokens + " for table " + _currentTableName); } if (! _curTableTokens.contains(TOKEN_XID)) { processError("missing transaction id TK-XID for table " + _currentTableName, null); } if (! _curTableTokens.contains(TokenState.TOKENSCN)) { processError("missing SCN TK-CSN for table " + _currentTableName, null); } } /** * End processing a <token> element */ private void endTokenElement() { } /** * End processing a <column> element */ private void endColumnElement() { } /** Process the start of an XML element - see if it is a element we care about */ private void processStartElement() { final String elemName = _xmlStreamReader.getLocalName(); if (ColumnState.COLUMNSTATE.equals(elemName)) { startColumnElement(); } else if (TokenState.TOKENSTATE.equals(elemName)) { startTokenElement(); } else if (DbUpdateState.DBUPDATE.equals(elemName)) { startDbupdateElement(); } } /** * Start processing a <dbupdate> element */ private void startDbupdateElement() { _curTableColumns.clear(); _curTableTokens.clear(); _currentTableName = _xmlStreamReader.getAttributeValue(null, DbUpdateState.TABLEATTR); String dbupdateTypeStr = _xmlStreamReader.getAttributeValue(null, DbUpdateState.UPDATEATTRNAME); if (null == dbupdateTypeStr) { processError("missing type for <dbupdate> element for table " + _currentTableName, null); } try { _dbupdateType = DbupdateType.valueOf(dbupdateTypeStr.toUpperCase()); } catch (IllegalArgumentException e) { processError("unknown <dbupdate> type:" + dbupdateTypeStr, null); } } /** * Start processing a <token> element */ private void startTokenElement() { processToken(_xmlStreamReader.getAttributeValue(null, TokenState.TOKENATTRNAME)); } /** * Start processing a <column> element */ private void startColumnElement() { processColumn(_xmlStreamReader.getAttributeValue(null, ColumnState.FIELDNAMEATTR), null != _xmlStreamReader.getAttributeValue(null, ColumnState.KEYNAMEATTR)); } public Logger getLog() { return _log; } /** * Last parsing or validation error */ public Throwable getLastError() { return _lastError; } /** * @return the lastPosition */ public long getLastPosition() { return _lastPosition; } /** * @return the lastFileName */ public String getLastFileName() { return _lastFileName; } /** * @return the errorCount */ public long getErrorCount() { return _errorCount; } }