package org.apache.lucene.analysis.jate;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.util.Span;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;

/**
 * Runs an OpenNLP {@link SentenceDetector} and/or OpenNLP tokenizer over the input.
 * At least one of the two must be supplied.
 *
 * <p>The whole input is read into memory, split into sentences, and each sentence is then
 * split into tokens. When no sentence detector is supplied the whole text is treated as a
 * single sentence; when no tokenizer is supplied each sentence is emitted as a single token.
 *
 * <p>For every token, sentence context information (and, when a {@link ParagraphChunker}
 * is supplied, paragraph context information) is serialized into the token's
 * {@link PayloadAttribute}.
 */
public final class OpenNLPTokenizer extends Tokenizer implements SentenceContextAware {
    private static final int DEFAULT_BUFFER_SIZE = 256;
    private static final Logger LOG = Logger.getLogger(OpenNLPTokenizer.class.getName());

    /** Corrected end offset of the most recently emitted token; reported by {@link #end()}. */
    private int finalOffset;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final PayloadAttribute tokenMetadataAtt = addAttribute(PayloadAttribute.class);

    /** Key: start offset of a sentence; value: the paragraph that contains it. */
    private Map<Integer, Paragraph> sentsInParagraph = new HashMap<>();
    /** Key: a paragraph; value: the number of sentences found in that paragraph. */
    private Map<Paragraph, Integer> paragraphHasSents = new HashMap<>();
    /** Key: start offset of a sentence; value: the sentence's id within its source paragraph. */
    private Map<Integer, Integer> sentIdsInParagraph = new HashMap<>();

    /** Sentence spans, as offsets into {@link #fullText}. */
    private Span[] sentences = null;
    /** Token spans per sentence, as offsets relative to the start of the owning sentence. */
    private Span[][] words = null;
    /** Token spans of the sentence currently being emitted. */
    private Span[] wordSet = null;

    /** True when the input still has to be loaded and analysed on the next increment. */
    boolean first = true;
    int indexSentence = 0;
    int indexWord = 0;

    /** The complete input text, cached because the OpenNLP operations run all-at-once. */
    private char[] fullText;

    private ParagraphChunker paragraphOp = null;
    private SentenceDetector sentenceOp = null;
    private opennlp.tools.tokenize.Tokenizer tokenizerOp = null;

    /**
     * Creates a tokenizer without paragraph detection.
     *
     * @param factory     attribute factory passed to the Lucene {@link Tokenizer}
     * @param sentenceOp  sentence detector, or {@code null} to treat the input as one sentence
     * @param tokenizerOp word tokenizer, or {@code null} to emit each sentence as one token
     * @throws IllegalArgumentException if both {@code sentenceOp} and {@code tokenizerOp} are null
     */
    public OpenNLPTokenizer(AttributeFactory factory, SentenceDetector sentenceOp,
                            opennlp.tools.tokenize.Tokenizer tokenizerOp) {
        this(factory, sentenceOp, tokenizerOp, null);
    }

    /**
     * Creates a tokenizer, optionally with paragraph detection.
     *
     * @param factory     attribute factory passed to the Lucene {@link Tokenizer}
     * @param sentenceOp  sentence detector, or {@code null} to treat the input as one sentence
     * @param tokenizerOp word tokenizer, or {@code null} to emit each sentence as one token
     * @param paragraphOp paragraph chunker, or {@code null} to skip paragraph metadata
     * @throws IllegalArgumentException if both {@code sentenceOp} and {@code tokenizerOp} are null
     */
    public OpenNLPTokenizer(AttributeFactory factory, SentenceDetector sentenceOp,
                            opennlp.tools.tokenize.Tokenizer tokenizerOp,
                            ParagraphChunker paragraphOp) {
        super(factory);
        termAtt.resizeBuffer(DEFAULT_BUFFER_SIZE);
        if (sentenceOp == null && tokenizerOp == null) {
            throw new IllegalArgumentException("OpenNLPTokenizer: need one or both of Sentence Detector and Tokenizer");
        }
        this.sentenceOp = sentenceOp;
        this.tokenizerOp = tokenizerOp;
        this.paragraphOp = paragraphOp;
    }

    /**
     * Emits the next token.
     *
     * <p>OpenNLP operations run all-at-once, so on the first call after construction or
     * {@link #reset()} the whole input is read and the sentence/word spans are cached;
     * subsequent calls feed the cached spans out one token at a time.
     *
     * @return {@code true} if a token was produced, {@code false} when the input is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (first) {
            loadAll();
            restartAtBeginning();
            first = false;
        }
        if (sentences.length == 0 || indexSentence >= sentences.length) {
            first = true;
            return false;
        }
        if (wordSet == null) {
            wordSet = words[indexSentence];
        }
        clearAttributes();

        // Advance past sentences whose tokens are used up (or which produced no tokens).
        while (indexWord == wordSet.length) {
            indexSentence++;
            if (indexSentence >= sentences.length) {
                first = true;
                return false;
            }
            wordSet = words[indexSentence];
            indexWord = 0;
        }

        Span sentence = sentences[indexSentence];
        Span word = wordSet[indexWord];
        int sentenceOffset = sentence.getStart();
        // Word spans are relative to their sentence; translate to an offset into fullText.
        int spot = sentenceOffset + word.getStart();
        int termLength = word.getEnd() - word.getStart();
        termAtt.copyBuffer(fullText, spot, termLength);

        int start = correctOffset(spot);
        finalOffset = correctOffset(sentenceOffset + word.getEnd());
        // Safeguard tweak to avoid invalid token offsets, see issue 26 on github.
        if (finalOffset - start > termLength) {
            offsetAtt.setOffset(start, start + termLength);
            LOG.warn("Invalid token start and end offsets diff greater than term length. End offset is reset to be start+tokenlength. "
                    + "start=" + start + ", invalid end=" + finalOffset + ", termlength=" + termLength
                    + ". See Issue 26 on JATE webpage");
        } else {
            offsetAtt.setOffset(start, finalOffset);
        }

        MWEMetadata ctx = addSentenceContext(new MWEMetadata(), indexWord, indexWord, null, indexSentence);
        if (paragraphOp != null) {
            Paragraph sourcePar = sentsInParagraph.get(sentence.getStart());
            Integer sentenceIdInParagraph = sentIdsInParagraph.get(sentence.getStart());
            // A sentence that fell outside every detected paragraph has no entry in either
            // map; emit the token without paragraph metadata instead of failing with an NPE.
            if (sourcePar != null && sentenceIdInParagraph != null) {
                addOtherMetadata(ctx, sourcePar.indexInDoc,
                        paragraphHasSents.get(sourcePar),
                        paragraphHasSents.size(),
                        sentenceIdInParagraph,
                        sentences.length);
            }
        }
        addPayloadAttribute(tokenMetadataAtt, ctx);

        indexWord++;
        return true;
    }

    /** Rewinds the sentence/word cursors to the first token of the cached spans. */
    void restartAtBeginning() throws IOException {
        indexWord = 0;
        indexSentence = 0;
        finalOffset = 0;
        wordSet = null;
    }

    /**
     * Reads the whole input, detects sentences (and paragraphs when a chunker is configured),
     * and tokenizes every sentence.
     *
     * @throws IOException if reading the input fails
     */
    void loadAll() throws IOException {
        fillBuffer();
        String txtStr = new String(fullText);
        detectSentences(txtStr);
        if (paragraphOp != null) {
            detectParagraphs(txtStr);
        }
        words = new Span[sentences.length][];
        for (int i = 0; i < sentences.length; i++) {
            splitWords(i);
        }
    }

    /**
     * Tokenizes sentence {@code i} and stores the sentence-relative token spans in
     * {@code words[i]}. Without a tokenizer the whole (non-empty) sentence becomes one token.
     *
     * @param i index into {@link #sentences}
     */
    void splitWords(int i) {
        Span current = sentences[i];
        int length = current.getEnd() - current.getStart();
        if (tokenizerOp != null) {
            String sentence = String.copyValueOf(fullText, current.getStart(), length);
            words[i] = tokenizerOp.tokenizePos(sentence);
        } else if (length > 0) {
            words[i] = new Span[]{new Span(0, length)};
        } else {
            words[i] = new Span[0];
        }
    }

    /**
     * Splits the full text into sentence spans. Without a sentence detector the whole
     * (non-empty) text is treated as a single sentence.
     *
     * <p>Spans are kept as raw offsets into the text; {@code correctOffset} is applied per
     * token in {@link #incrementToken()}.
     *
     * @param fulltext the complete input text
     */
    void detectSentences(String fulltext) throws IOException {
        if (sentenceOp != null) {
            sentences = sentenceOp.sentPosDetect(fulltext);
        } else if (fulltext.isEmpty()) {
            sentences = new Span[0];
        } else {
            sentences = new Span[]{new Span(0, fulltext.length())};
        }
    }

    /**
     * Splits the text into paragraphs and records, for each sentence, which paragraph
     * contains it and the sentence's running id inside that paragraph.
     *
     * <p>The search for a sentence's paragraph only moves forward from the paragraph of the
     * previous sentence. A sentence whose start offset lies in none of the remaining
     * paragraphs is left unassigned and does not disturb the cursor or the running id.
     *
     * @param txtStr the complete input text
     */
    void detectParagraphs(String txtStr) {
        sentsInParagraph.clear();
        paragraphHasSents.clear();
        sentIdsInParagraph.clear();

        List<Paragraph> paragraphs = paragraphOp.chunk(txtStr);
        if (paragraphs == null || paragraphs.isEmpty()) {
            return;
        }

        int parCursor = 0;
        int sentenceIdInPar = 0;
        for (Span sent : sentences) {
            int sentStart = sent.getStart();

            int match = -1;
            for (int i = parCursor; i < paragraphs.size(); i++) {
                if (containsOffset(paragraphs.get(i), sentStart)) {
                    match = i;
                    break;
                }
            }
            if (match < 0) {
                continue;
            }
            if (match != parCursor) {
                // Moved on to a later paragraph: sentence ids restart from zero there.
                parCursor = match;
                sentenceIdInPar = 0;
            }

            Paragraph par = paragraphs.get(parCursor);
            sentsInParagraph.put(sentStart, par);
            Integer count = paragraphHasSents.get(par);
            paragraphHasSents.put(par, count == null ? 1 : count + 1);
            sentIdsInParagraph.put(sentStart, sentenceIdInPar);
            sentenceIdInPar++;
        }
    }

    /** Returns whether {@code offset} lies within the paragraph's inclusive offset range. */
    private static boolean containsOffset(Paragraph par, int offset) {
        return offset >= par.startOffset && offset <= par.endOffset;
    }

    /** Reads the entire input into {@link #fullText}. */
    void fillBuffer() throws IOException {
        fullText = IOUtils.toCharArray(input);
    }

    /**
     * Sets the final offset to the end offset of the last emitted token.
     */
    @Override
    public final void end() {
        // NOTE(review): Lucene's TokenStream documentation asks overriders to call
        // super.end(); that requires declaring IOException here, which would change this
        // method's signature — confirm with callers before adding it.
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /**
     * Prepares the tokenizer for a new input.
     *
     * <p>The cached text is marked stale so the next {@link #incrementToken()} re-reads the
     * input. Without this, a stream that was abandoned before exhaustion would replay the
     * previous document's tokens on reuse.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearAttributes();
        restartAtBeginning();
        first = true;
    }

    /**
     * Records sentence-level context on the given metadata object.
     *
     * @param ctx             metadata object to populate
     * @param firstTokenIndex index in the sentence of the first composing token
     * @param lastTokenIndex  index in the sentence of the last composing token
     * @param posTag          part-of-speech tag (this class passes {@code null})
     * @param sentenceIndex   index of the source sentence in the document
     * @return the same {@code ctx} instance
     */
    public MWEMetadata addSentenceContext(MWEMetadata ctx, int firstTokenIndex, int lastTokenIndex,
                                          String posTag, int sentenceIndex) {
        ctx.addMetaData(MWEMetadataType.FIRST_COMPOSING_TOKEN_ID_IN_SENT, String.valueOf(firstTokenIndex));
        ctx.addMetaData(MWEMetadataType.LAST_COMPOSING_TOKEN_ID_IN_SENT, String.valueOf(lastTokenIndex));
        ctx.addMetaData(MWEMetadataType.POS, posTag);
        ctx.addMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_DOC, String.valueOf(sentenceIndex));
        return ctx;
    }

    /**
     * Records paragraph-level context on the given metadata object.
     *
     * @param ctx                       metadata object to populate
     * @param paragraphId               id of the source paragraph in the document
     * @param totalSentencesInParagraph number of sentences in the source paragraph
     * @param totalParagraphsInDoc      number of paragraphs in the document
     * @param sentenceIdInParagraph     id of the source sentence within its paragraph
     * @param totalSentencesInDoc       number of sentences in the document
     */
    protected void addOtherMetadata(MWEMetadata ctx, int paragraphId, int totalSentencesInParagraph,
                                    int totalParagraphsInDoc, int sentenceIdInParagraph,
                                    int totalSentencesInDoc) {
        ctx.addMetaData(MWEMetadataType.SOURCE_PARAGRAPH_ID_IN_DOC, String.valueOf(paragraphId));
        ctx.addMetaData(MWEMetadataType.SENTENCES_IN_PARAGRAPH, String.valueOf(totalSentencesInParagraph));
        ctx.addMetaData(MWEMetadataType.PARAGRAPHS_IN_DOC, String.valueOf(totalParagraphsInDoc));
        ctx.addMetaData(MWEMetadataType.SOURCE_SENTENCE_ID_IN_PARAGRAPH, String.valueOf(sentenceIdInParagraph));
        ctx.addMetaData(MWEMetadataType.SENTENCES_IN_DOC, String.valueOf(totalSentencesInDoc));
    }

    /**
     * Serializes the metadata and stores it as the payload of the given attribute.
     *
     * @param attribute payload attribute to write to
     * @param ctx       metadata to serialize
     */
    public void addPayloadAttribute(PayloadAttribute attribute, MWEMetadata ctx) {
        String data = MWEMetadata.serialize(ctx);
        attribute.setPayload(new BytesRef(data));
    }
}