/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cocoon.transformation; import net.sourceforge.chaperon.build.LexicalAutomatonBuilder; import net.sourceforge.chaperon.common.Decoder; import net.sourceforge.chaperon.model.lexicon.Lexicon; import net.sourceforge.chaperon.model.lexicon.LexiconFactory; import net.sourceforge.chaperon.process.LexicalAutomaton; import net.sourceforge.chaperon.process.PatternProcessor; import org.apache.avalon.excalibur.pool.Recyclable; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.logger.LogEnabled; import org.apache.avalon.framework.logger.Logger; import org.apache.avalon.framework.parameters.ParameterException; import org.apache.avalon.framework.parameters.Parameterizable; import org.apache.avalon.framework.parameters.Parameters; import org.apache.avalon.framework.service.ServiceException; import org.apache.avalon.framework.service.ServiceManager; import org.apache.avalon.framework.service.Serviceable; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.xml.XMLUtils; import org.apache.cocoon.caching.CacheableProcessingComponent; import org.apache.cocoon.components.source.SourceUtil; import org.apache.cocoon.environment.SourceResolver; //import org.apache.commons.logging.impl.AvalonLogger; import org.apache.excalibur.source.Source; import org.apache.excalibur.source.SourceException; import org.apache.excalibur.source.SourceValidity; import org.apache.excalibur.store.Store; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import java.io.IOException; import java.io.Serializable; import java.util.Map; /** * This transfomer transforms text pattern of a XML file into lexemes by using a lexicon file. * * <p> * Input: * </p> * <pre> * <section> * Text 123 bla * </section> * </pre> * * <p> * can be transformed into the following output: * </p> * <pre> * <section> * Text * <lexeme symbol="number" text="123"/> * bla * </section> * </pre> * * @author <a href="mailto:stephan@apache.org">Stephan Michels</a> * @version $Id$ */ public class PatternTransformer extends AbstractTransformer implements LogEnabled, Serviceable, Recyclable, Disposable, Parameterizable, CacheableProcessingComponent { /** Namespace for the SAX events. */ public static final String NS = "http://chaperon.sourceforge.net/schema/lexemes/2.0"; private String lexicon = null; private Source lexiconSource = null; private Logger logger = null; private ServiceManager manager = null; private SourceResolver resolver = null; private LexicalAutomaton automaton = null; private PatternProcessor processor = new PatternProcessor(); private boolean groups = false; private StringBuffer buffer = new StringBuffer(); private StringBuffer output = new StringBuffer(); /** * Provide component with a logger. * * @param logger the logger */ public void enableLogging(Logger logger) { this.logger = logger; } /** * Pass the ServiceManager to the Serviceable. The Serviceable implementation should use the * specified ServiceManager to acquire the services it needs for execution. * * @param manager The ServiceManager which this Serviceable uses. */ public void service(ServiceManager manager) { this.manager = manager; } /** * Provide component with parameters. * * @param parameters the parameters * * @throws ParameterException if parameters are invalid */ public void parameterize(Parameters parameters) throws ParameterException { groups = parameters.getParameterAsBoolean("groups", false); } /** * Set the SourceResolver, objectModel Map, the source and sitemap Parameters used to process the * request. * * @param resolver Source resolver * @param objectmodel Object model * @param src Source * @param parameters Parameters * * @throws IOException * @throws ProcessingException * @throws SAXException */ public void setup(SourceResolver resolver, Map objectmodel, String src, Parameters parameters) throws ProcessingException, SAXException, IOException { this.resolver = resolver; Store store = null; try { this.lexicon = src; this.lexiconSource = resolver.resolveURI(this.lexicon); // Retrieve the lexical automaton from the transient store store = (Store)this.manager.lookup(Store.TRANSIENT_STORE); LexicalAutomatonEntry entry = (LexicalAutomatonEntry)store.get(this.lexiconSource.getURI()); // If the lexicon has changed, rebuild the lexical automaton if ((entry==null) || (entry.getValidity()==null) || (entry.getValidity().isValid(this.lexiconSource.getValidity())<=0)) { this.logger.info("(Re)building the automaton from '"+this.lexiconSource.getURI()+"'"); if (this.lexiconSource.getInputStream()==null) throw new ProcessingException("Source '"+this.lexiconSource.getURI()+"' not found"); LexiconFactory factory = new LexiconFactory(); SourceUtil.toSAX(this.manager, this.lexiconSource, null, factory); Lexicon lexicon = factory.getLexicon(); LexicalAutomatonBuilder builder = new LexicalAutomatonBuilder(lexicon/*, new AvalonLogger(this.logger)*/); this.automaton = builder.getLexicalAutomaton(); this.logger.info("Store automaton into store for '"+this.lexiconSource.getURI()+"'"); store.store(this.lexiconSource.getURI(), new LexicalAutomatonEntry(this.automaton, this.lexiconSource.getValidity())); } else { this.logger.info("Getting automaton from store for '"+this.lexiconSource.getURI()+"'"); this.automaton = entry.getLexicalAutomaton(); } } catch (SourceException se) { throw new ProcessingException("Error during resolving of '"+src+"'.", se); } catch (ServiceException se) { throw new ProcessingException("Could not lookup for component", se); } finally { if (store!=null) this.manager.release(store); } } /** * Generate the unique key. This key must be unique inside the space of this component. * * @return The generated key hashes the src */ public Serializable getKey() { return this.lexiconSource.getURI(); } /** * Generate the validity object. * * @return The generated validity object or <code>null</code> if the component is currently not * cacheable. */ public SourceValidity getValidity() { return this.lexiconSource.getValidity(); } /** * Recycle this component. All instance variables are set to <code>null</code>. */ public void recycle() { if ((this.resolver!=null) && (this.lexiconSource!=null)) { this.resolver.release(this.lexiconSource); this.lexiconSource = null; } this.automaton = null; super.recycle(); } /** * The dispose operation is called at the end of a components lifecycle. */ public void dispose() { if ((this.resolver!=null) && (this.lexiconSource!=null)) { this.resolver.release(this.lexiconSource); this.lexiconSource = null; } this.manager = null; } /** * Receive notification of the beginning of an element. * * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or if * Namespace processing is not being performed. * @param loc The local name (without prefix), or the empty string if Namespace processing is not * being performed. * @param raw The raw XML 1.0 name (with prefix), or the empty string if raw names are not * available. * @param a The attributes attached to the element. If there are no attributes, it shall be an * empty Attributes object. * * @throws SAXException */ public void startElement(String uri, String loc, String raw, Attributes a) throws SAXException { search(); if (contentHandler!=null) contentHandler.startElement(uri, loc, raw, a); } /** * Receive notification of the end of an element. * * @param uri The Namespace URI, or the empty string if the element has no Namespace URI or if * Namespace processing is not being performed. * @param loc The local name (without prefix), or the empty string if Namespace processing is not * being performed. * @param raw The raw XML 1.0 name (with prefix), or the empty string if raw names are not * available. * * @throws SAXException */ public void endElement(String uri, String loc, String raw) throws SAXException { search(); if (contentHandler!=null) contentHandler.endElement(uri, loc, raw); } /** * Receive notification of character data. * * @param c The characters from the XML document. * @param start The start position in the array. * @param len The number of characters to read from the array. * * @throws SAXException */ public void characters(char[] c, int start, int len) throws SAXException { buffer.append(c, start, len); } /** * Receive notification of ignorable whitespace in element content. * * @param c The characters from the XML document. * @param start The start position in the array. * @param len The number of characters to read from the array. * * @throws SAXException */ public void ignorableWhitespace(char[] c, int start, int len) throws SAXException { buffer.append(c, start, len); } /** * Receive notification of a processing instruction. * * @param target The processing instruction target. * @param data The processing instruction data, or null if none was supplied. * * @throws SAXException */ public void processingInstruction(String target, String data) throws SAXException { search(); if (contentHandler!=null) contentHandler.processingInstruction(target, data); } /** * Report an XML comment anywhere in the document. * * @param ch An array holding the characters in the comment. * @param start The starting position in the array. * @param len The number of characters to use from the array. * * @throws SAXException */ public void comment(char[] ch, int start, int len) throws SAXException { search(); if (lexicalHandler!=null) lexicalHandler.comment(ch, start, len); } /** * @throws SAXException */ private void search() throws SAXException { if (buffer.length()<=0) return; char[] text = buffer.toString().toCharArray(); String lexemesymbol; String lexemetext; String[] groups = null; int lexemeindex = 0; int position = 0; output.setLength(0); do { lexemesymbol = null; lexemetext = null; for (lexemeindex = automaton.getLexemeCount()-1; lexemeindex>=0; lexemeindex--) { processor.setPatternAutomaton(automaton.getLexemeDefinition(lexemeindex)); if ((processor.match(text, position)) && ((lexemetext==null) || (processor.getGroup().length()>=lexemetext.length()))) { lexemesymbol = automaton.getLexemeSymbol(lexemeindex); lexemetext = processor.getGroup(); if (this.groups) { groups = new String[processor.getGroupCount()]; for (int group = 0; group<processor.getGroupCount(); group++) groups[group] = processor.getGroup(group); } } } if ((lexemetext!=null) && (lexemetext.length()>0)) { if (lexemesymbol!=null) { if (logger!=null) logger.debug("Recognize token "+lexemesymbol+" with "+Decoder.toString(lexemetext)); if (output.length()>0) contentHandler.characters(output.toString().toCharArray(), 0, output.length()); output.setLength(0); contentHandler.startPrefixMapping("", NS); AttributesImpl atts = new AttributesImpl(); atts.addAttribute("", "symbol", "symbol", "CDATA", lexemesymbol); atts.addAttribute("", "text", "text", "CDATA", lexemetext); contentHandler.startElement(NS, "lexeme", "lexeme", atts); if (this.groups) { for (int group = 0; group<groups.length; group++) { contentHandler.startElement(NS, "group", "group", XMLUtils.EMPTY_ATTRIBUTES); contentHandler.characters(groups[group].toCharArray(), 0, groups[group].length()); contentHandler.endElement(NS, "group", "group"); } } contentHandler.endElement(NS, "lexeme", "lexeme"); contentHandler.endPrefixMapping(""); } else if (logger!=null) logger.debug("Ignore lexeme with "+Decoder.toString(lexemetext)); position += lexemetext.length(); } else { output.append(text[position]); position++; } } while (position<text.length); if (output.length()>0) contentHandler.characters(output.toString().toCharArray(), 0, output.length()); buffer.setLength(0); } /** * This class represent a entry in a store to cache the lexical automaton. */ public static class LexicalAutomatonEntry implements Serializable { private SourceValidity validity = null; private LexicalAutomaton automaton = null; /** * Create a new entry. * * @param automaton Lexical automaton. * @param validity Validity of the lexicon file. */ public LexicalAutomatonEntry(LexicalAutomaton automaton, SourceValidity validity) { this.automaton = automaton; this.validity = validity; } /** * Return the validity of the lexicon file. * * @return Validity of the lexicon file. */ public SourceValidity getValidity() { return this.validity; } /** * Return the lexical automaton. * * @return Lexical automaton. */ public LexicalAutomaton getLexicalAutomaton() { return this.automaton; } private void writeObject(java.io.ObjectOutputStream out) throws IOException { out.writeObject(validity); out.writeObject(automaton); } private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { validity = (SourceValidity)in.readObject(); automaton = (LexicalAutomaton)in.readObject(); } } }