package clear.experiment; import clear.dep.DepNode; import clear.dep.DepTree; import clear.morph.MorphEnAnalyzer; import clear.reader.SRLReader; import clear.util.IOUtil; import java.io.PrintStream; import java.util.ArrayList; import java.util.HashSet; public class ExtractClusterFeature { final String SLOT_TRUE = " 1"; final String SLOT_FALSE = " 0"; final String[] ARR_DEPREL = {"DTV", "OBJ", "OPRD", "PRD", "ADV", "BNF", "DIR", "EXT", "LOC", "MNR", "PRP", "TMP"}; final String[] ARR_IN = {"of", "to", "in", "for", "on", "at", "by", "from", "with", "as"}; final String[] ARR_PRT = {"up", "out", "off", "down", "in", "back", "on", "over", "away", "around"}; public ExtractClusterFeature(String dicFile, String inputFile, String outputFile) { MorphEnAnalyzer morph = new MorphEnAnalyzer(dicFile); SRLReader reader = new SRLReader(inputFile, true); try (PrintStream fout = IOUtil.createPrintFileStream(outputFile)) { DepTree tree; DepNode node; while ((tree = reader.nextTree()) != null) { for (int i = 1; i < tree.size(); i++) { node = tree.get(i); if (node.isPredicate()) { fout.println(getFeatures(morph, tree, node)); } } } } } String getFeatures(MorphEnAnalyzer morph, DepTree tree, DepNode pred) { StringBuilder build = new StringBuilder(); build.append(morph.getLemma(pred.form, "VB")); build.append(" "); build.append(pred.getFeat("vn")); // VerbNet class ArrayList<DepNode> aDeps = tree.getDependents(pred.id); HashSet<String> sDeprels = tree.getDeprelDepSet(pred.id); int nSlot = 0; nSlot += getOBJ2Feature(aDeps, build); nSlot += getDeprelFeatures(sDeprels, build); nSlot += getPrepFeatures(aDeps, build); nSlot += getParticleFeature(tree, pred, build); build.append(" "); build.append(nSlot); return build.toString(); } int getOBJ2Feature(ArrayList<DepNode> aDeps, StringBuilder build) { int count = 0; for (DepNode node : aDeps) { if (node.isDeprel("OBJ") && ++count > 1) { build.append(SLOT_TRUE); return 1; } } build.append(SLOT_FALSE); return 0; } int getDeprelFeatures(HashSet<String> sDeprels, StringBuilder build) { int count = 0; for (String deprel : ARR_DEPREL) { if (sDeprels.contains(deprel)) { build.append(SLOT_TRUE); count++; } else { build.append(SLOT_FALSE); } } return count; } int getPrepFeatures(ArrayList<DepNode> aDeps, StringBuilder build) { HashSet<String> sIN = new HashSet<>(); for (DepNode node : aDeps) { if (node.isPos("IN") && !node.isDeprel("PRT")) { sIN.add(node.lemma); } } int count = 0; for (String in : ARR_IN) { if (sIN.contains(in)) { build.append(SLOT_TRUE); count++; } else { build.append(SLOT_FALSE); } } return count; } int getParticleFeature(DepTree tree, DepNode pred, StringBuilder build) { String nPrt = tree.getPRT(pred.id); int count = 0; for (String prt : ARR_PRT) { if (prt.equals(nPrt)) { build.append(SLOT_TRUE); count++; } else { build.append(SLOT_FALSE); } } return count; } static public void main(String[] args) { String dicFile = args[0]; String inputFile = args[1]; String outputFile = args[2]; ExtractClusterFeature extractClusterFeature = new ExtractClusterFeature(dicFile, inputFile, outputFile); } }