package org.apache.solr.handler; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.Collection; /** * * @deprecated Use {@link org.apache.solr.handler.DocumentAnalysisRequestHandler} instead. **/ public class AnalysisRequestHandler extends RequestHandlerBase { public static Logger log = LoggerFactory.getLogger(AnalysisRequestHandler.class); private XMLInputFactory inputFactory; @Override public void init(NamedList args) { super.init(args); inputFactory = XMLInputFactory.newInstance(); try { // The java 1.6 bundled stax parser (sjsxp) does not currently have a thread-safe // XMLInputFactory, as that implementation tries to cache and reuse the // XMLStreamReader. Setting the parser-specific "reuse-instance" property to false // prevents this. // All other known open-source stax parsers (and the bea ref impl) // have thread-safe factories. inputFactory.setProperty("reuse-instance", Boolean.FALSE); } catch (IllegalArgumentException ex) { // Other implementations will likely throw this exception since "reuse-instance" // isimplementation specific. log.debug("Unable to set the 'reuse-instance' property for the input factory: " + inputFactory); } } public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { SolrParams params = req.getParams(); Iterable<ContentStream> streams = req.getContentStreams(); if (streams != null) { for (ContentStream stream : req.getContentStreams()) { Reader reader = stream.getReader(); try { XMLStreamReader parser = inputFactory.createXMLStreamReader(reader); NamedList<Object> result = processContent(parser, req.getSchema()); rsp.add("response", result); } finally { IOUtils.closeQuietly(reader); } } } } NamedList<Object> processContent(XMLStreamReader parser, IndexSchema schema) throws XMLStreamException, IOException { NamedList<Object> result = new SimpleOrderedMap<Object>(); while (true) { int event = parser.next(); switch (event) { case XMLStreamConstants.END_DOCUMENT: { parser.close(); return result; } case XMLStreamConstants.START_ELEMENT: { String currTag = parser.getLocalName(); if ("doc".equals(currTag)) { log.trace("Tokenizing doc..."); SolrInputDocument doc = readDoc(parser); SchemaField uniq = schema.getUniqueKeyField(); NamedList<NamedList<NamedList<Object>>> theTokens = new SimpleOrderedMap<NamedList<NamedList<Object>>>(); result.add(doc.getFieldValue(uniq.getName()).toString(), theTokens); for (String name : doc.getFieldNames()) { FieldType ft = schema.getFieldType(name); Analyzer analyzer = ft.getAnalyzer(); Collection<Object> vals = doc.getFieldValues(name); for (Object val : vals) { Reader reader = new StringReader(val.toString()); TokenStream tstream = analyzer.tokenStream(name, reader); NamedList<NamedList<Object>> tokens = getTokens(tstream); theTokens.add(name, tokens); } } } break; } } } } static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException { // outer is namedList since order of tokens is important NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>(); // TODO: support custom attributes CharTermAttribute termAtt = null; TermToBytesRefAttribute bytesAtt = null; if (tstream.hasAttribute(CharTermAttribute.class)) { termAtt = tstream.getAttribute(CharTermAttribute.class); } else if (tstream.hasAttribute(TermToBytesRefAttribute.class)) { bytesAtt = tstream.getAttribute(TermToBytesRefAttribute.class); } final OffsetAttribute offsetAtt = tstream.addAttribute(OffsetAttribute.class); final TypeAttribute typeAtt = tstream.addAttribute(TypeAttribute.class); final PositionIncrementAttribute posIncAtt = tstream.addAttribute(PositionIncrementAttribute.class); final BytesRef bytes = new BytesRef(); while (tstream.incrementToken()) { NamedList<Object> token = new SimpleOrderedMap<Object>(); tokens.add("token", token); if (termAtt != null) { token.add("value", termAtt.toString()); } if (bytesAtt != null) { bytesAtt.toBytesRef(bytes); // TODO: This is incorrect when numeric fields change in later lucene versions. It should use BytesRef directly! token.add("value", bytes.utf8ToString()); } token.add("start", offsetAtt.startOffset()); token.add("end", offsetAtt.endOffset()); token.add("posInc", posIncAtt.getPositionIncrement()); token.add("type", typeAtt.type()); //TODO: handle payloads } return tokens; } SolrInputDocument readDoc(XMLStreamReader parser) throws XMLStreamException { SolrInputDocument doc = new SolrInputDocument(); StringBuilder text = new StringBuilder(); String name = null; String attrName = ""; float boost = 1.0f; boolean isNull = false; while (true) { int event = parser.next(); switch (event) { // Add everything to the text case XMLStreamConstants.SPACE: case XMLStreamConstants.CDATA: case XMLStreamConstants.CHARACTERS: text.append(parser.getText()); break; case XMLStreamConstants.END_ELEMENT: if ("doc".equals(parser.getLocalName())) { return doc; } else if ("field".equals(parser.getLocalName())) { if (!isNull) { doc.addField(name, text.toString(), boost); boost = 1.0f; } } break; case XMLStreamConstants.START_ELEMENT: text.setLength(0); String localName = parser.getLocalName(); if (!"field".equals(localName)) { log.warn("unexpected XML tag doc/" + localName); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "unexpected XML tag doc/" + localName); } String attrVal = ""; for (int i = 0; i < parser.getAttributeCount(); i++) { attrName = parser.getAttributeLocalName(i); attrVal = parser.getAttributeValue(i); if ("name".equals(attrName)) { name = attrVal; } } break; } } } //////////////////////// SolrInfoMBeans methods ////////////////////// @Override public String getDescription() { return "Provide Analysis of text"; } @Override public String getVersion() { return "$Revision:$"; } @Override public String getSourceId() { return "$Id:$"; } @Override public String getSource() { return "$URL:$"; } }