package hu.u_szeged.kpe.main; import hu.u_szeged.kpe.readers.DocumentData; import hu.u_szeged.kpe.readers.DocumentSet; import hu.u_szeged.utils.ClassificationInstance; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.ObjectInputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; import java.util.List; /** * The class that executes keyphrase extraction itself. */ public class KeyPhraseExtractor { private KPEFilter m_KPEFilter = null; private boolean m_prune; /** The number of phrases to extract. */ private int m_numPhrases = 10; /** The m_reader for the processing of the test data */ private DocumentSet docSet; /** * Flag indicating whether potentially synonymous words should be treated specially when determining class labels */ public KeyPhraseExtractor() { } public void setDocSet(DocumentSet docs) { docSet = docs; } public DocumentSet getDocSet() { return docSet; } /** * Get the value of numPhrases. * * @return Value of numPhrases. */ public int getNumPhrases() { return m_numPhrases; } /** * Set the value of numPhrases. * * @param newnumPhrases * Value to assign to numPhrases. */ public void setNumPhrases(int newnumPhrases) { m_numPhrases = newnumPhrases; } public void setPrune(boolean prune) { m_prune = prune; } public boolean getPruning() { return m_prune; } public void loadModel(String fileName) throws Exception { ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(fileName))); m_KPEFilter = (KPEFilter) in.readObject(); in.close(); } public void setKPEFilter(KPEFilter kf) { m_KPEFilter = kf; } public KPEFilter getKpeFilter() { return m_KPEFilter; } /** * @param out * @param prunedPhrases * @param doc */ private String writeOutKeyphrases(PrintWriter out, List<int[]> length, List<ClassificationInstance> prunedPhrases, DocumentData doc) { out.print("\n\n----------" + doc.getFile() + "_" + doc.getLineNumInFile() + "----------"); if (length != null && length.size() == 1) { int[] l = length.get(0); out.print("\t" + l[0] + "\t" + l[1]); } StringBuffer keyphrases = new StringBuffer(); for (int i = 0; i < prunedPhrases.size(); ++i) { ClassificationInstance inst = prunedPhrases.get(i); out.print("\n" + inst); if (i < 15) { keyphrases.append(inst.getProbableForm() + (i < 14 ? "," : "")); } } out.flush(); return keyphrases.toString(); } public void extractKeyphrases(int fold, int totalFolds, String fileName, boolean serialize) throws Exception { Collection<DocumentData> documents = docSet.determineDocumentSet(fold, totalFolds, false); System.err.println("Determining keyphrases of the test set of " + documents.size() + " documents..."); // m_KPEFilter.setAcceptSynonyms(acceptSynonymsForTesting); m_KPEFilter.initializeFeatureFields(); PrintWriter out = new PrintWriter(fileName); System.err.println("output file: " + fileName); for (DocumentData doc : documents) { List<int[]> length = new ArrayList<>(1); List<ClassificationInstance> rankedPhrases = m_KPEFilter.rankDocumentInstances(docSet.getReader(), length, serialize, doc); int positiveInstances = 0; for (ClassificationInstance ci : rankedPhrases) { if (((Boolean) ci.getClassLabel())) { positiveInstances++; } } System.err.println(doc.getFile() + "\t" + doc.getKeyphrases().size() + "\t" + positiveInstances); // if (m_prune) // rankedPhrases = filterTopInstances(rankedPhrases); String response = writeOutKeyphrases(out, length, rankedPhrases.subList(0, Math.min(m_numPhrases, rankedPhrases.size())), doc); out.println("\nEtalon set: " + doc.getKeyphrases().keySet()); out.println(doc.getFile().replaceAll(".*([CIJH]-\\d+).*", "$1") + (doc.getLineNumInFile() == 0 ? "" : doc.getLineNumInFile()) + " : " + response); } out.close(); } }