package ruc.irm.classification;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ruc.irm.similarity.sentence.SegmentProxy;
import ruc.irm.similarity.sentence.SegmentProxy.Word;
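/**
 * Represents a single document instance: a category label plus the set of
 * words extracted from the document.
 *
 * @author xiatian
 *
 */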
public class Instance {
    /** Document category (label). */
    private String category;

    /** Document content: the distinct words retained from the document. */
    private final Set<String> bag = new HashSet<String>();

    /** Creates an empty instance with no category and an empty word set. */
    public Instance() {
    }

    /**
     * Builds an instance by reading a file line by line, segmenting each line
     * with {@link SegmentProxy#segment} and keeping the words whose
     * part-of-speech tag ends with {@code "adj"} or starts with {@code "n"}
     * or {@code "v"}.
     *
     * <p>Each line is echoed to standard output as it is read. If an
     * {@link IOException} occurs (including a missing file or an unsupported
     * encoding name), the file path, the last line read and the stack trace
     * are printed, and the instance keeps whatever words were collected up to
     * that point; the exception is not propagated.
     *
     * <p>The underlying file stream is always closed before the constructor
     * returns.
     *
     * @param category the category label to assign to this instance
     * @param f        the file to read the document text from
     * @param encoding the name of the character encoding used to decode the file
     */
    public Instance(String category, File f, String encoding) {
        this.category = category;
        String line = null;
        // Track the raw stream (not the reader) so it is released even when
        // constructing the InputStreamReader fails on a bad encoding name.
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(f);
            BufferedReader in = new BufferedReader(new InputStreamReader(stream, encoding));
            while ((line = in.readLine()) != null) {
                // NOTE(review): echoes every input line to stdout; looks like
                // leftover debug output -- confirm before removing.
                System.out.println(line);
                List<Word> words = SegmentProxy.segment(line);
                for (Word w : words) {
                    String pos = w.getPos();
                    // NOTE(review): "adj" is matched as a suffix while "n"/"v"
                    // are matched as prefixes -- confirm against the tag set
                    // produced by SegmentProxy.
                    if (pos.endsWith("adj") || pos.startsWith("n") || pos.startsWith("v")) {
                        bag.add(w.getWord());
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and keep the words read so far.
            System.out.println("current file:" + f.getAbsolutePath());
            System.out.println("current line:" + line);
            e.printStackTrace();
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only
                    // stream fails; the content has already been consumed.
                }
            }
        }
    }

    /**
     * Returns the category label of this instance.
     *
     * @return the category, or {@code null} if none has been set
     */
    public String getCategory() {
        return category;
    }

    /**
     * Sets the category label of this instance.
     *
     * @param category the new category label
     */
    public void setCategory(String category) {
        this.category = category;
    }

    /**
     * Returns the words collected for this document.
     *
     * <p>The returned set is the internal, mutable set itself, not a copy, so
     * changes made through it are visible to this instance.
     *
     * @return the set of retained words; never {@code null}
     */
    public Set<String> getWords() {
        return bag;
    }
}