/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize.lang.en;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.Span;

/**
 * Class which produces an {@code Iterator<TokenSample>} from a file of space-delimited tokens.
 * This class uses a number of English-specific heuristics to un-separate tokens which
 * are typically found together in text.
*/ public class TokenSampleStream implements Iterator<TokenSample> { private BufferedReader in; private String line; private Pattern alphaNumeric = Pattern.compile("[A-Za-z0-9]"); private boolean evenq = true; public TokenSampleStream(InputStream is) throws IOException { this.in = new BufferedReader(new InputStreamReader(is)); line = in.readLine(); } public boolean hasNext() { return line != null; } public TokenSample next() { String[] tokens = line.split("\\s+"); if (tokens.length == 0) { evenq = true; } StringBuilder sb = new StringBuilder(line.length()); List<Span> spans = new ArrayList<>(); int length = 0; for (int ti = 0; ti < tokens.length; ti++) { String token = tokens[ti]; String lastToken = ti - 1 >= 0 ? tokens[ti - 1] : ""; switch (token) { case "-LRB-": token = "("; break; case "-LCB-": token = "{"; break; case "-RRB-": token = ")"; break; case "-RCB-": token = "}"; break; } if (sb.length() != 0) { if (!alphaNumeric.matcher(token).find() || token.startsWith("'") || token.equalsIgnoreCase("n't")) { if ((token.equals("``") || token.equals("--") || token.equals("$") || token.equals("(") || token.equals("&") || token.equals("#") || (token.equals("\"") && (evenq && ti != tokens.length - 1))) && (!lastToken.equals("(") || !lastToken.equals("{"))) { //System.out.print(" "+token); length++; } } else { if (!lastToken.equals("``") && (!lastToken.equals("\"") || evenq) && !lastToken.equals("(") && !lastToken.equals("{") && !lastToken.equals("$") && !lastToken.equals("#")) { length++; } } } if (token.equals("\"")) { evenq = ti == tokens.length - 1 || !evenq; } if (sb.length() < length) { sb.append(" "); } sb.append(token); spans.add(new Span(length, length + token.length())); length += token.length(); } try { line = in.readLine(); } catch (IOException e) { e.printStackTrace(); line = null; } return new TokenSample(sb.toString(),spans.toArray(new Span[spans.size()])); } public void remove() { throw new UnsupportedOperationException(); } private static void usage() { 
System.err.println("TokenSampleStream [-spans] < in"); System.err.println("Where in is a space delimited list of tokens."); } }