NameFinderCensus90NameStream.java example

Explorer
opennlp-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;

import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;

/**
 * Reads US Census name data line by line and turns each usable line into a
 * single-token {@link StringList}, suitable as an entry of the name-finder
 * dictionary.
 * The entries in the source file are as follows:
 * <pre>
 *      SMITH          1.006  1.006      1
 * </pre>
 * <ul>
 * <li>The first field is the name (in ALL CAPS).
 * <li>The next field is a frequency in percent.
 * <li>The next is a cumulative frequency in percent.
 * <li>The last is a ranking.
 * </ul>
 * <p>
 * Only the first field is used. It is converted from ALL CAPS to mixed case
 * (<code>SMITH</code> becomes <code>Smith</code>, <code>MCDONALD</code> becomes
 * <code>McDonald</code>). Lines that carry no name field followed by a space
 * separator (empty lines, lines made only of spaces, lines without any
 * separator) are skipped.
 * <p>
 * <b>Note:</b> Do not use this class, internal use only!
 */
public class NameFinderCensus90NameStream implements ObjectStream<StringList> {

  /** Locale used for the upper/lower case conversions; always English. */
  private final Locale locale;

  /** Encoding associated with the input; see the constructors for how it is chosen. */
  private final Charset encoding;

  /** Source of the raw census lines. */
  private final ObjectStream<String> lineStream;

  /**
   * Wraps an already opened line stream.
   *
   * @param lineStream  an {@code ObjectStream<String>} that delivers the lines
   *                    of the census input file.
   */
  public NameFinderCensus90NameStream(ObjectStream<String> lineStream) {
    this.locale = Locale.ENGLISH;   // the census data is English
    // The encoding of an already open ObjectStream cannot be queried, so the
    // platform default is recorded here.
    // todo how do we find the encoding for an already open ObjectStream() ?
    this.encoding = Charset.defaultCharset();
    this.lineStream = lineStream;
  }

  /**
   * Opens a line stream over the input supplied by the given factory, using the
   * given encoding.
   *
   * @param in  an <code>InputStreamFactory</code> for the input file.
   * @param encoding  the <code>Charset</code> to apply to the input stream.
   * @throws IOException if the underlying line stream cannot be created.
   */
  public NameFinderCensus90NameStream(InputStreamFactory in, Charset encoding)
      throws IOException {
    this.locale = Locale.ENGLISH;   // the census data is English
    this.encoding = encoding;
    this.lineStream = new PlainTextByLineStream(in, this.encoding);
  }

  /**
   * Returns the next name found in the input.
   * <p>
   * Lines without a usable name field are skipped instead of being reported as
   * <code>null</code>, because consumers of an <code>ObjectStream</code> treat
   * <code>null</code> as the end of the stream and would otherwise stop at the
   * first blank or malformed line.
   *
   * @return a <code>StringList</code> holding the mixed-case name as its only
   *         token, or <code>null</code> once the underlying line stream is
   *         exhausted.
   * @throws IOException if reading from the underlying line stream fails.
   */
  public StringList read() throws IOException {
    String line;
    while ((line = lineStream.read()) != null) {
      String field = firstField(line);
      if (field != null) {
        return new StringList(new String[]{toMixedCase(field)});
      }
    }
    // the underlying stream returned null: genuine end of input
    return null;
  }

  /**
   * Extracts the name field: the run of characters between any leading spaces
   * and the first space that follows them.
   *
   * @param line  one raw line of input, never <code>null</code>.
   * @return the non-empty name field, or <code>null</code> if the line is
   *         empty, consists only of spaces, or has no space after the name.
   */
  private static String firstField(String line) {
    int start = 0;
    // Skip leading spaces so that an indented or space-only line cannot
    // produce an empty token (which would break the case conversion).
    while (start < line.length() && line.charAt(start) == ' ') {
      start++;
    }
    if (start == line.length()) {
      return null;
    }
    int end = line.indexOf(' ', start);
    if (end == -1) {
      // no separator after the name: not in the expected format
      return null;
    }
    return line.substring(start, end);
  }

  /**
   * Converts an ALL CAPS name to mixed case. Names longer than two characters
   * that start with <code>MC</code> additionally get their third character
   * capitalized.
   *
   * @param parsed  the non-empty name field as read from the input.
   * @return the name in mixed case.
   */
  private String toMixedCase(String parsed) {
    if ((parsed.length() > 2) && parsed.startsWith("MC")) {
      return parsed.substring(0, 1).toUpperCase(locale)
          + parsed.substring(1, 2).toLowerCase(locale)
          + parsed.substring(2, 3).toUpperCase(locale)
          + parsed.substring(3).toLowerCase(locale);
    }
    return parsed.substring(0, 1).toUpperCase(locale)
        + parsed.substring(1).toLowerCase(locale);
  }

  /**
   * Resets the underlying line stream.
   *
   * @throws IOException if the underlying line stream fails to reset.
   * @throws UnsupportedOperationException if the underlying line stream does
   *         not support resetting.
   */
  public void reset() throws IOException, UnsupportedOperationException {
    lineStream.reset();
  }

  /**
   * Closes the underlying line stream.
   *
   * @throws IOException if closing the underlying line stream fails.
   */
  public void close() throws IOException {
    lineStream.close();
  }

}