CRFExtractor.java example

Explorer
topic-modeling-master
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;


import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;

import cc.mallet.fst.CRF;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.*;

/**
 * Created: Oct 12, 2004
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A>
 * @version $Id: CRFExtractor.java,v 1.1 2007/10/22 21:37:44 mccallum Exp $
 */
public class CRFExtractor implements Extractor {

  private CRF crf;
  private Pipe tokenizationPipe;
  private Pipe featurePipe;
  private String backgroundTag;
  private TokenizationFilter filter;

  public CRFExtractor (CRF crf) {
    this (crf, new Noop ());
  }

  public CRFExtractor (File crfFile) throws IOException {
    this (loadCrf(crfFile), new Noop ());
  }

  public CRFExtractor (CRF crf, Pipe tokpipe) {
    this (crf, tokpipe, new BIOTokenizationFilter ());
  }

  public CRFExtractor (CRF crf, Pipe tokpipe, TokenizationFilter filter) {
    this (crf, tokpipe, filter, "O");
  }

  public CRFExtractor (CRF crf, Pipe tokpipe, TokenizationFilter filter, String backgroundTag) {
    this.crf = crf;
    tokenizationPipe = tokpipe;
    featurePipe = (Pipe) crf.getInputPipe ();
    this.filter = filter;
    this.backgroundTag = backgroundTag;
   }


  private static CRF loadCrf (File crfFile) throws IOException
  {
     ObjectInputStream ois = new ObjectInputStream( new FileInputStream( crfFile ) );
    CRF crf = null;

    // We shouldn't run into a ClassNotFound exception...
    try {
      crf = (CRF)ois.readObject();
    } catch (ClassNotFoundException e) {
      System.err.println ("Internal MALLET error: Could not read CRF from file "+crfFile+"\n"+e);
      e.printStackTrace ();
      throw new RuntimeException (e);
    }

    ois.close();
    return crf;
  }



  public Extraction extract (Object o)
  {
    // I don't think there's a polymorphic way to do this. b/c Java sucks. -cas
    if (o instanceof Tokenization) {
      return extract ((Tokenization) o);
    } 
    else if (o instanceof InstanceList) {
    	return extract ((InstanceList) o);
    }
    else  {
      return extract (doTokenize (o));
    }
  }


  private Tokenization doTokenize (Object obj)
  {
    Instance toked = new Instance (obj, null, null, null);
    tokenizationPipe.pipe (toked);
    return (Tokenization) toked.getData ();
  }


  public Extraction extract (Tokenization spans)
  {
    // We assume the input is unpiped.
    Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null));
    Sequence output = crf.transduce ((Sequence) carrier.getData ());
    Extraction extraction = new Extraction (this, getTargetAlphabet());
    DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), 
                                                        spans,
                                                        output, null, backgroundTag,
                                                        filter);
    extraction.addDocumentExtraction (docseq);
    return extraction;
  }

  public InstanceList pipeInstances (Iterator<Instance> source)
  {
    // I think that pipes should be associated neither with InstanceLists, nor
    //  with Instances. -cas
    InstanceList toked = new InstanceList (tokenizationPipe);
    toked.addThruPipe (source);
    InstanceList piped = new InstanceList (getFeaturePipe ());
    piped.addThruPipe (toked.iterator());
    return piped;
  }

	/** Assumes Instance.source contains the Tokenization object. */
	public Extraction extract (InstanceList ilist) {
    Extraction extraction = new Extraction (this, getTargetAlphabet ());
		for (int i = 0; i < ilist.size(); i++) {
			Instance inst = ilist.get(i);
			Tokenization tok = (Tokenization)inst.getSource();
      String name = inst.getName().toString();
      Sequence input = (Sequence)inst.getData ();
      Sequence target = (Sequence)inst.getTarget ();
      Sequence output = crf.transduce(input);
      DocumentExtraction docseq =
				new DocumentExtraction (name, getTargetAlphabet(), tok,
																output, target, backgroundTag,
																filter);
      extraction.addDocumentExtraction (docseq);			
		}
    return extraction;
	}
	
  public Extraction extract (Iterator<Instance> source)
  {
    Extraction extraction = new Extraction (this, getTargetAlphabet ());
    // Put all the instances through both pipes, then get viterbi path
    InstanceList tokedList = new InstanceList (tokenizationPipe);
    tokedList.addThruPipe (source);
    InstanceList pipedList = new InstanceList (getFeaturePipe ());
    pipedList.addThruPipe (tokedList.iterator());

    Iterator<Instance> it1 = tokedList.iterator ();
    Iterator<Instance> it2 = pipedList.iterator ();
    while (it1.hasNext()) {
      Instance toked = it1.next();
      Instance piped = it2.next ();
      Tokenization tok = (Tokenization) toked.getData();
      String name = piped.getName().toString();
      Sequence input = (Sequence) piped.getData ();
      Sequence target = (Sequence) piped.getTarget ();
      Sequence output = crf.transduce (input);

      DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,
                                                          output, target, backgroundTag,
                                                          filter);
      extraction.addDocumentExtraction (docseq);
    }
    return extraction;
  }

  public TokenizationFilter getTokenizationFilter ()
  {
    return filter;
  }
	
  public String getBackgroundTag ()
  {
    return backgroundTag;
  }

  public Pipe getTokenizationPipe ()
  {
    return tokenizationPipe;
  }


  public void setTokenizationPipe (Pipe tokenizationPipe)
  {
    this.tokenizationPipe = tokenizationPipe;
  }


  public Pipe getFeaturePipe ()
  {
    return featurePipe;
  }

  //xxx This method is inherent dangerous!!! Should check that pipe.alphabet equals crf.alphabet
  public void setFeaturePipe (Pipe featurePipe)
  {
    this.featurePipe = featurePipe;
  }

  public Alphabet getInputAlphabet ()
  {
    return crf.getInputAlphabet ();
  }


  public LabelAlphabet getTargetAlphabet ()
  {
    return (LabelAlphabet) crf.getOutputAlphabet ();
  }


  public CRF getCrf ()
  {
    return crf;
  }

  /**
   * Transfer some Pipes from the feature pipe to the tokenization pipe.
   *  The feature pipe must be a SerialPipes.  This will destructively modify the CRF object of the extractor.
   *   This is useful if you have a CRF hat has been trained from a single pipe, which you need to split up
   *    int feature and tokenization pipes
   */
  public void slicePipes (int num)
  {
    Pipe fpipe = getFeaturePipe ();
    if (!(fpipe instanceof SerialPipes))
      throw new IllegalArgumentException ("slicePipes: FeaturePipe must be a SerialPipes.");
    SerialPipes sp = (SerialPipes) fpipe;
    ArrayList pipes = new ArrayList ();
    for (int i = 0; i < num; i++) {
      pipes.add (sp.getPipe (0));  
      //sp.removePipe (0); TODO Fix this
    }
    //setTokenizationPipe (sp);  TODO Fix this
  	throw new UnsupportedOperationException ("Not yet implemented...");
  }

  // Java serialization nonsense

  // Serial version 0:  Initial version
  // Serial version 1:  Add featurePipe
  // Serial version 2:  Add filter
  private static final int CURRENT_SERIAL_VERSION = 2;
  private static final long serialVersionUID = 1;

  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    in.defaultReadObject ();
    int version = in.readInt ();
    if ((version == 0) || (featurePipe == null)) {
      featurePipe = (Pipe) crf.getInputPipe ();
    }
    if (version < 2) {
      filter = new BIOTokenizationFilter ();
    }
  }

  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.defaultWriteObject ();
    out.writeInt (CURRENT_SERIAL_VERSION);
  }


  public Sequence pipeInput (Object input)
  {
    InstanceList all = new InstanceList (getFeaturePipe ());
    all.add (input, null, null, null);
    return (Sequence) all.get (0).getData();
  }
}