/**
* Copyright 2014, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.experiment;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringJoiner;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import edu.emory.clir.clearnlp.collection.map.ObjectIntHashMap;
import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair;
import edu.emory.clir.clearnlp.constituent.CTNode;
import edu.emory.clir.clearnlp.constituent.CTReader;
import edu.emory.clir.clearnlp.constituent.CTTree;
import edu.emory.clir.clearnlp.dependency.DEPLib;
import edu.emory.clir.clearnlp.dependency.DEPNode;
import edu.emory.clir.clearnlp.dependency.DEPTree;
import edu.emory.clir.clearnlp.lexicon.propbank.frameset.PBFFrameset;
import edu.emory.clir.clearnlp.lexicon.propbank.frameset.PBFMap;
import edu.emory.clir.clearnlp.lexicon.propbank.frameset.PBFRole;
import edu.emory.clir.clearnlp.lexicon.propbank.frameset.PBFRoleset;
import edu.emory.clir.clearnlp.lexicon.propbank.frameset.PBFType;
import edu.emory.clir.clearnlp.pos.POSLibEn;
import edu.emory.clir.clearnlp.reader.TSVReader;
import edu.emory.clir.clearnlp.srl.SRLTree;
import edu.emory.clir.clearnlp.util.FileUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Splitter;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.arc.SRLArc;
/**
* @since 3.0.0
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class Z
{
public Z(String[] args) throws Exception
{
matchPropBankTags();
// String root = "/Users/jdchoi/Desktop/allen/mturk-old/";
// String f0 = root+"AllenFixedVsOpenFutureRuleOld.csv.0.txt.cnlp";
// String f1 = root+"AllenFixedVsOpenFutureRuleOld.csv.1.txt.cnlp";
// String f2 = root+"AllenFixedVsOpenFutureRuleOld.csv.2.txt.cnlp";
// int i, N = 5;
//
// PrintStream[] fout = new PrintStream[N];
// for (i=0; i<N; i++) fout[i] = IOUtils.createBufferedPrintStream(root+"cv"+i+".tst");
//
// crossValidate(f0, fout, N);
// crossValidate(f1, fout, N);
// crossValidate(f2, fout, N);
// for (i=0; i<N; i++) fout[i].close();
}
public void matchPropBankTags() throws Exception
{
String frameDir = "/Users/jdchoi/Downloads/frames";
String inputFile = "/Users/jdchoi/Documents/Data/experiments/general-en/onto.all";
PBFMap map = new PBFMap(frameDir);
TSVReader reader = new TSVReader(0, 1, 2, 3, 4, 5, 6, 7);
reader.open(IOUtils.createFileInputStream(inputFile));
Pattern argn = Pattern.compile("^A(\\d)");
Pattern argm = Pattern.compile("^AM");
Set<String> set = new HashSet<>();
String pb, n, lemma;
PBFRoleset roleset;
DEPTree tree;
PBFRole role;
SRLTree srl;
while ((tree = reader.next()) != null)
{
for (DEPNode node : tree)
{
pb = node.getFeat(DEPLib.FEAT_PB);
if (pb == null || pb.endsWith("LV")) continue;
srl = tree.getSRLTree(node);
lemma = pb.substring(0, pb.length()-3);
roleset = map.getRoleset(PBFType.VERB, lemma, pb);
if (roleset == null) continue;
for (SRLArc arc : srl.getArgumentArcList(argn))
{
n = arc.getLabel().substring(1,2);
role = roleset.getRole(n);
if (role == null) System.out.println(pb+" "+n+" "+arc.getLabel());
}
for (SRLArc arc : srl.getArgumentArcList(argm))
set.add(arc.getLabel());
}
}
List<String> list = new ArrayList<>(set);
Collections.sort(list);
for (String s : list) System.out.println(s);
}
void crossValidate(String inputFile, PrintStream[] fout, int N)
{
TSVReader reader = new TSVReader(0, 1, 2, 3, 4, 5, 6);
reader.open(IOUtils.createFileInputStream(inputFile));
DEPTree tree;
int i = -1;
while ((tree = reader.next()) != null)
{
i = (i+1) % N;
fout[i].println(tree.toString(DEPNode::toStringNER)+"\n");
}
reader.close();
}
public void simplifyTokens(String[] args) throws Exception
{
BufferedReader reader = IOUtils.createBufferedReader(args[0]);
PrintStream fout = IOUtils.createBufferedPrintStream(args[0]+".wop");
Set<String> set = new HashSet<>();
StringJoiner joiner;
String line;
while ((line = reader.readLine()) != null)
{
joiner = new StringJoiner(" ");
for (String s : Splitter.splitSpace(line))
if (!StringUtils.containsPunctuationOnly(s))
joiner.add(StringUtils.toSimplifiedForm(s));
line = StringUtils.toLowerCase(joiner.toString().trim());
if (!line.isEmpty() && !set.contains(line))
{
fout.println(line);
set.add(line);
}
}
reader.close();
fout.close();
}
public void countPOS(String[] args) throws Exception
{
final String inputPath = args[0];
final String outputPath = args[1];
ObjectIntHashMap<String> noun = new ObjectIntHashMap<>();
ObjectIntHashMap<String> verb = new ObjectIntHashMap<>();
TSVReader reader = new TSVReader(0, 1, 2, 3, 4, 5, 6);
DEPTree tree;
for (String filename : FileUtils.getFileList(inputPath, "cnlp", false))
{
reader.open(IOUtils.createFileInputStream(filename));
System.out.println(filename);
while ((tree = reader.next()) != null)
{
for (DEPNode node : tree)
{
if (POSLibEn.isCommonOrProperNoun(node.getPOSTag()))
noun.add(node.getLemma());
else if (POSLibEn.isVerb(node.getPOSTag()))
verb.add(node.getLemma());
}
}
reader.close();
}
print(verb, outputPath+".verb");
print(noun, outputPath+".noun");
}
private void print(ObjectIntHashMap<String> map, String outputFile)
{
PrintStream fout = IOUtils.createBufferedPrintStream(outputFile);
List<ObjectIntPair<String>> list = map.toList();
Collections.sort(list);
for (ObjectIntPair<String> p : list) fout.println(p.o+"\t"+p.i);
fout.close();
}
public void extractARGM() throws Exception
{
TSVReader reader = new TSVReader(0, 1, 2, 3, 4, 5, 6, 7);
ObjectIntHashMap<String> map = new ObjectIntHashMap<>();
DEPTree tree;
reader.open(IOUtils.createFileInputStream("/Users/jdchoi/Documents/Data/general/english/onto.dep"));
while ((tree = reader.next()) != null)
{
for (DEPNode node : tree)
{
for (SRLArc arc : node.getSemanticHeadArcList())
{
if (arc.getLabel().startsWith("AM"))
map.add(arc.getLabel().substring(3));
}
}
}
List<ObjectIntPair<String>> ps = map.toList();
Collections.sort(ps, Collections.reverseOrder());
for (ObjectIntPair<String> p : ps)
System.out.println(p.o+" "+p.i);
}
public void printRaw(String[] args) throws Exception
{
String filename = "/Users/jdchoi/Documents/Data/general/google.parse";
CTReader reader = new CTReader(new FileInputStream(filename));
PrintStream fout = IOUtils.createBufferedPrintStream(filename+".raw");
StringJoiner joiner;
CTTree tree;
while ((tree = reader.nextTree()) != null)
{
joiner = new StringJoiner(" ");
for (CTNode node : tree.getTokenList())
joiner.add(node.getWordForm());
fout.println(joiner.toString());
}
fout.close();
}
class Tmp
{
long i;
public void add(int j)
{
i += j;
}
}
public void frameset(String[] args) throws Exception
{
// PBFMap map = new PBFMap(args[0]);
// ObjectOutputStream out = new ObjectOutputStream(IOUtils.createXZBufferedOutputStream(args[1]));
// out.writeObject(map);
// out.close();
ObjectInputStream in = new ObjectInputStream(IOUtils.createXZBufferedInputStream(args[0]));
PBFMap map = (PBFMap)in.readObject();
Map<String,PBFFrameset> framesets = map.getFramesetMap(PBFType.VERB);
List<Set<String>> argns = new ArrayList<>();
List<String> list; int n;
for (n=0; n<6; n++) argns.add(new HashSet<String>());
Set<String> rolesets = new HashSet<>();
// int count;
final int N = Integer.parseInt(args[1]);
final String TAG = args.length > 2 ? args[2] : "";
for (PBFFrameset frameset : framesets.values())
{
for (PBFRoleset roleset : frameset.getRolesets())
{
// count = 0;
for (PBFRole role : roleset.getRoles())
{
if (StringUtils.containsDigitOnly(role.getArgumentNumber()))
{
n = Integer.parseInt(role.getArgumentNumber());
argns.get(n).add(role.getFunctionTag());
// if (n == 0 && role.isFunctionTag("PPT")) count++;
// if (n == 1 && role.isFunctionTag("PAG")) count++;
// if (n == N && role.isFunctionTag(TAG)) rolesets.add(roleset.getID());
// if (role.isFunctionTag(TAG)) rolesets.add(roleset.getID());
if (n == N && !role.isFunctionTag(TAG)) rolesets.add(roleset.getID());
// if (role.isFunctionTag(TAG)) count++;
}
}
// if (count > 1) rolesets.add(roleset.getID());
}
}
// for (n=0; n<argns.size(); n++)
// {
// list = new ArrayList<>(argns.get(n));
// Collections.sort(list);
// System.out.println(n+" "+list.toString());
// }
list = new ArrayList<>(rolesets);
Collections.sort(list);
for (String s : list) System.out.println(s);
}
@SuppressWarnings("resource")
public void amazon(String[] args) throws Exception
{
BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream("/Users/jdchoi/Downloads/Books.txt.gz"))));
Pattern p = Pattern.compile(" ");
String[] t;
String line;
long l, max = -1, min = Long.MAX_VALUE;
for (long e=1; (line = reader.readLine()) != null; e++)
{
line = line.trim();
if (line.startsWith("review/time:"))
{
t = p.split(line);
l = Long.parseLong(t[1]);
max = Math.max(l, max);
min = Math.min(l, min);
}
if (e%10000000 == 0) System.out.print(".");
} System.out.println();
System.out.println(new Date(max*1000).toString());
System.out.println(new Date(min*1000).toString());
}
public void test()
{
Map<Integer,Double> map;
int i, j, len = 100, size = 1000000;
long st, et;
map = new HashMap<>();
for (j=0; j<len; j++) map.put(j, (double)j);
st = System.currentTimeMillis();
for (i=0; i<size; i++)
{
for (j=0; j<len; j++)
map.compute(j, (k, v) -> v /len);
}
et = System.currentTimeMillis();
System.out.println(et-st);
map = new HashMap<>();
for (j=0; j<len; j++) map.put(j, (double)j);
st = System.currentTimeMillis();
for (i=0; i<size; i++)
{
for (j=0; j<len; j++)
map.computeIfPresent(j, (k, v) -> v /len);
}
et = System.currentTimeMillis();
System.out.println(et-st);
}
static public void main(String[] args)
{
try
{
new Z(args);
}
catch (Exception e) {e.printStackTrace();}
}
}