package is2.lemmatizer; import is2.data.*; import is2.io.CONLLReader09; import is2.parser.Parser; import is2.tools.IPipe; import is2.util.DB; import is2.util.OptionsSuper; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.util.*; import java.util.Map.Entry; final public class Pipe extends PipeGen implements IPipe { private static final int _MIN_WORDS_MAPPED_BY_SCRIPT = 1; private static final int _MIN_OCCURENT_FOR_SCRIPT_USE = 4; private static final String _F0 = "F0"; private static final String _F1 = "F1", _F2 = "F2", _F3 = "F3", _F4 = "F4", _F5 = "F5", _F6 = "F6", _F7 = "F7", _F8 = "F8", _F9 = "F9", _F10 = "F10"; private static final String _F11 = "F11", _F12 = "F12", _F13 = "F13", _F14 = "F14", _F15 = "F15", _F16 = "F16", _F17 = "F17", _F18 = "F18", _F19 = "F19", _F20 = "F20"; private static final String _F21 = "F21", _F22 = "F22", _F23 = "F23", _F24 = "F24", _F25 = "F25", _F26 = "F26", _F27 = "F27", _F28 = "F28", _F29 = "F29", _F30 = "F30"; private static final String _F31 = "F31", _F32 = "F32", _F33 = "F33", _F34 = "F34", _F35 = "F35", _F36 = "F36", _F37 = "F37", _F38 = "F38", _F39 = "F39", _F40 = "F40"; private static final String _F41 = "F41"; private static int _f0, _f1, _f2, _f3, _f4, _f5, _f6, _f7, _f8, _f9, _f10, _f11, _f12, _f13, _f14, _f15, _f16, _f17, _f18, _f19, _f20; private static int _f21, _f22, _f23, _f24, _f25, _f26, _f27, _f28, _f29, _f30, _f31, _f32, _f33, _f34, _f35, _f36, _f37, _f38, _f39, _f41; public static int _CEND, _swrd, _ewrd; public static final String MID = "MID", END = "END", STR = "STR", OPERATION = "OP"; private CONLLReader09 depReader; public HashMap<String, String> opse = new HashMap<>(); public String[] types; public MFO mf = new MFO(); private D4 z, x; Cluster cl; OptionsSuper options; Long2Int li; public Pipe(OptionsSuper options2, Long2Int l) { options = options2; li = l; } @Override public InstancesTagger createInstances(String file) { InstancesTagger is = new InstancesTagger(); depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); depReader.startReading(file); mf.register(REL, "<root-type>"); mf.register(POS, "<root-POS>"); Parser.out.print("Registering feature parts "); HashMap<String, Integer> ops = new HashMap<>(); HashMap<String, HashSet<String>> op2form = new HashMap<>(); int ic = 0; int del = 0; HashSet<String> rm = new HashSet<>(); while (true) { SentenceData09 instance1 = depReader.getNext(); if (instance1 == null) { break; } ic++; if (ic % 100 == 0) { del = outValue(ic, del); } String[] labs1 = instance1.labels; for (int i1 = 0; i1 < labs1.length; i1++) { //typeAlphabet.lookupIndex(labs1[i1]); mf.register(REL, labs1[i1]); } String[] w = instance1.forms; for (int i1 = 0; i1 < w.length; i1++) { // saw the first time? if (mf.getValue(WORD, w[i1].toLowerCase()) == -1) { opse.put(instance1.forms[i1].toLowerCase(), instance1.lemmas[i1]); } mf.register(WORD, w[i1].toLowerCase()); } for (int i1 = 0; i1 < w.length; i1++) { mf.register(WORD, w[i1]); } w = instance1.lemmas; for (int i1 = 0; i1 < w.length; i1++) { mf.register(WORD, w[i1]); } for (int i1 = 0; i1 < w.length; i1++) { mf.register(WORD, w[i1].toLowerCase()); } w = instance1.plemmas; for (int i1 = 0; i1 < w.length; i1++) { mf.register(WORD, w[i1]); } for (int i1 = 0; i1 < w.length; i1++) { mf.register(WORD, w[i1].toLowerCase()); } for (int i1 = 0; i1 < w.length; i1++) { registerChars(CHAR, w[i1]); } w = instance1.ppos; for (int i1 = 0; i1 < w.length; i1++) { mf.register(POS, w[i1]); } w = instance1.gpos; for (int i1 = 0; i1 < w.length; i1++) { mf.register(POS, w[i1]); } for (int i1 = 1; i1 < w.length; i1++) { String op = getOperation(instance1, i1); if (ops.get(op) == null) { ops.put(op, 1); } else { ops.put(op, (ops.get(op) + 1)); if (ops.get(op) > 4) { rm.add(instance1.forms[i1].toLowerCase()); } } HashSet<String> forms = op2form.get(op); if (forms == null) { forms = new HashSet<>(); op2form.put(op, forms); } forms.add(instance1.forms[i1].toLowerCase()); } } int countFreqSingleMappings = 0; int sc = 0; ArrayList<Entry<String, Integer>> opsl = new ArrayList<>(); for (Entry<String, Integer> e : ops.entrySet()) { // do not use scripts for infrequent cases or frequent single mappings (der -> die) if (e.getValue() > _MIN_OCCURENT_FOR_SCRIPT_USE && op2form.get(e.getKey()).size() > _MIN_WORDS_MAPPED_BY_SCRIPT) { mf.register(OPERATION, e.getKey()); sc++; opsl.add(e); } else { // do not remove the infrequent cases rm.removeAll(op2form.get(e.getKey())); if (op2form.get(e.getKey()).size() <= 1) { countFreqSingleMappings += op2form.get(e.getKey()).size(); } } } for (String k : rm) { opse.remove(k); } Collections.sort(opsl, new Comparator<Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 1 : -1; } }); for (Entry<String, Integer> e : opsl) { // Parser.out.println(e.getKey()+" "+e.getValue()); } if (options.clusterFile == null) { cl = new Cluster(); } else { cl = new Cluster(options.clusterFile, mf, 6); } Parser.out.println("\nfound scripts " + ops.size() + " used scripts " + sc); Parser.out.println("found mappings of single words " + countFreqSingleMappings); Parser.out.println("use word maps instead of scripts " + this.opse.size()); // Parser.out.println(" "+opse); Parser.out.println("" + mf.toString()); initFeatures(); mf.calculateBits(); initValues(); depReader.startReading(options.trainfile); int i = 0; long start1 = System.currentTimeMillis(); Parser.out.print("Creating Features: "); is.init(ic, mf); del = 0; while (true) { try { if (i % 100 == 0) { del = outValue(i, del); } SentenceData09 instance1 = depReader.getNext(is); if (instance1 == null) { break; } is.fillChars(instance1, i, _CEND); if (i > options.count) { break; } i++; } catch (Exception e) { DB.println("error in sentnence " + i); e.printStackTrace(); } } long end1 = System.currentTimeMillis(); System.gc(); long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); Parser.out.print(" time " + (end1 - start1) + " mem " + (mem2 / 1024) + " kb"); types = new String[mf.getFeatureCounter().get(OPERATION)]; for (Entry<String, Integer> e : is2.lemmatizer.MFO.getFeatureSet().get(OPERATION).entrySet()) { types[e.getValue()] = e.getKey(); // Parser.out.println("set pos "+e.getKey()); } Parser.out.println("Num Features: " + mf.size()); return is; } /** * @param is * @param n * @param k * @param wds * @return */ public static String getOperation(Instances is, int n, int k, String[] wds) { String form = wds[is.forms[n][k]]; String olemma = wds[is.glemmas[n][k]]; String s = new StringBuffer(form.toLowerCase()).reverse().toString(); String t = new StringBuffer(olemma.toLowerCase()).reverse().toString(); return getOperation2(s, t); } public static String getOperation(SentenceData09 instance1, int i1) { String s = new StringBuffer(instance1.forms[i1].toLowerCase()).reverse().toString(); String t = new StringBuffer(instance1.lemmas[i1].toLowerCase()).reverse().toString(); return getOperation2(s, t); } public static String getOperation(String si, String ti) { String s = new StringBuffer(si.toLowerCase()).reverse().toString(); String t = new StringBuffer(ti.toLowerCase()).reverse().toString(); return getOperation2(s, t); } private static String getOperation2(String s, String t) { StringBuffer po = new StringBuffer(); String op; if (!s.equals(t)) { int[][] d = StringEdit.LD(s, t); StringEdit.searchPath(s, t, d, po, false); op = po.toString(); } else { op = "0"; // do nothing } return op; } private void registerChars(String type, String word) { for (int i = 0; i < word.length(); i++) { mf.register(type, Character.toString(word.charAt(i))); } } @Override public void initValues() { z = new D4(li); x = new D4(li); x.a0 = s_type; s_pos = mf.getFeatureCounter().get(POS).intValue();//mf.getFeatureBits(POS); s_word = mf.getFeatureCounter().get(WORD); s_type = mf.getFeatureCounter().get(TYPE).intValue();//mf.getFeatureBits(TYPE); s_char = mf.getFeatureCounter().get(CHAR).intValue();//mf.getFeatureBits(CHAR); s_oper = mf.getFeatureCounter().get(OPERATION).intValue();//mf.getFeatureBits(OPERATION); types = new String[mf.getFeatureCounter().get(Pipe.OPERATION)]; for (Entry<String, Integer> e : is2.lemmatizer.MFO.getFeatureSet().get(Pipe.OPERATION).entrySet()) { types[e.getValue()] = e.getKey(); } //wds = new String[mf.getFeatureCounter().get(Pipe.WORD)]; //for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.WORD).entrySet()) wds[e.getValue()] = e.getKey(); z.a0 = s_type; z.a1 = s_oper; z.a2 = s_char; z.a3 = s_char; z.a4 = s_char; z.a5 = s_char; z.a6 = s_char; z.a7 = s_char; x.a0 = s_type; x.a1 = s_oper; x.a2 = s_word; x.a3 = s_word; x.a4 = s_word; x.a5 = s_char; x.a6 = s_char; x.a7 = s_char; } public static int s_pos, s_word, s_type, s_dir, s_dist, s_char, s_oper; /** * Initialize the features. * * @param maxFeatures */ @Override public void initFeatures() { for (int k = 0; k < 50; k++) { mf.register(TYPE, "F" + k); } _f0 = mf.register(TYPE, _F0); _f1 = mf.register(TYPE, _F1); _f2 = mf.register(TYPE, _F2); _f3 = mf.register(TYPE, _F3); _f4 = mf.register(TYPE, _F4); _f5 = mf.register(TYPE, _F5); _f6 = mf.register(TYPE, _F6); _f7 = mf.register(TYPE, _F7); _f8 = mf.register(TYPE, _F8); _f9 = mf.register(TYPE, _F9); _f10 = mf.register(TYPE, _F10); _f11 = mf.register(TYPE, _F11); _f12 = mf.register(TYPE, _F12); _f13 = mf.register(TYPE, _F13); _f14 = mf.register(TYPE, _F14); _f15 = mf.register(TYPE, _F15); _f16 = mf.register(TYPE, _F16); _f17 = mf.register(TYPE, _F17); _f18 = mf.register(TYPE, _F18); _f19 = mf.register(TYPE, _F19); _f20 = mf.register(TYPE, _F20); _f21 = mf.register(TYPE, _F21); _f22 = mf.register(TYPE, _F22); _f23 = mf.register(TYPE, _F23); _f24 = mf.register(TYPE, _F24); _f25 = mf.register(TYPE, _F25); _f26 = mf.register(TYPE, _F26); _f27 = mf.register(TYPE, _F27); _f28 = mf.register(TYPE, _F28); _f29 = mf.register(TYPE, _F29); _f30 = mf.register(TYPE, _F30); _f31 = mf.register(TYPE, _F31); _f32 = mf.register(TYPE, _F32); _f33 = mf.register(TYPE, _F33); _f34 = mf.register(TYPE, _F34); _f35 = mf.register(TYPE, _F35); _f36 = mf.register(TYPE, _F36); _f37 = mf.register(TYPE, _F37); _f38 = mf.register(TYPE, _F38); mf.register(POS, MID); mf.register(POS, STR); mf.register(POS, END); mf.register(TYPE, CHAR); _swrd = mf.register(WORD, STR); _ewrd = mf.register(WORD, END); _CEND = mf.register(CHAR, END); } final public void addCoreFeatures(InstancesTagger is, int ic, int i, int oper, String form, long[] f) { for (int l = f.length - 1; l >= 0; l--) { f[l] = 0; } int formi = is.forms[ic][i]; int wl = is.chars[ic][i][11];//.forms[i].length(); int position = 1 + (i < 3 ? i : 3); int c0 = is.chars[ic][i][0], c1 = is.chars[ic][i][1], c2 = is.chars[ic][i][2], c3 = is.chars[ic][i][3], c4 = is.chars[ic][i][4], c5 = is.chars[ic][i][5]; int e0 = is.chars[ic][i][6], e1 = is.chars[ic][i][7], e2 = is.chars[ic][i][8], e3 = is.chars[ic][i][9], e4 = is.chars[ic][i][10]; int len = is.length(ic); x.v1 = oper; x.v0 = _f0; x.v2 = formi; x.cz3(); f[0] = x.getVal(); f[1] = x.csa(3, position); x.v0 = _f1; x.v2 = formi; x.v3 = i + 1 >= len ? x.v3 = _ewrd : is.forms[ic][i + 1]; x.cz4(); f[2] = x.getVal(); // contains upper case include again!!! short upper = 0; short number = 1; for (int k1 = 0; k1 < wl; k1++) { char c = form.charAt(k1); if (Character.isUpperCase(c)) { if (k1 == 0) { upper = 1; } else { // first char + another if (upper == 1) { upper = 3; } // another uppercase in the word else if (upper == 0) { upper = 2; } } } if (Character.isDigit(c) && k1 == 0) { number = 2; } else if (Character.isDigit(c) && number == 1) { number = 3; } } // contains a number z.v0 = _f21; z.v2 = number; z.cz3(); f[3] = z.getVal(); z.v0 = _f4; z.v1 = oper; z.v2 = c0; z.cz3(); f[4] = z.getVal(); z.v0 = _f5; z.v2 = e0; z.cz3(); f[5] = z.getVal(); z.v2 = c0; z.v3 = c1; z.v4 = c2; z.v5 = c3; z.v6 = c4; z.v0 = _f6; z.cz4(); f[6] = z.getVal(); z.v0 = _f7; z.cz5(); f[7] = z.getVal(); z.v0 = _f8; z.cz6(); f[8] = z.getVal(); z.v0 = _f9; z.cz7(); f[9] = z.getVal(); int c = 10; z.v2 = e0; z.v3 = e1; z.v4 = e2; z.v5 = e3; z.v6 = e4; z.v0 = _f10; z.cz4(); f[c++] = z.getVal(); f[c++] = z.csa(3, upper); z.v0 = _f11; z.cz5(); f[c++] = z.getVal(); f[c++] = z.csa(3, upper); z.v0 = _f12; z.cz6(); f[c++] = z.getVal(); f[c++] = z.csa(3, upper); z.v0 = _f13; z.cz7(); f[c++] = z.getVal(); f[c++] = z.csa(3, upper); if (len > i + 1) { z.v0 = _f14; z.v2 = is.chars[ic][i + 1][0]; z.cz3(); f[c++] = z.getVal(); z.v0 = _f15; z.v2 = is.chars[ic][i + 1][5]; z.cz3(); f[c++] = z.getVal(); if (is.chars[ic][i + 1][11] > 1) { z.v0 = _f16; z.v2 = is.chars[ic][i + 1][0]; z.v3 = is.chars[ic][i + 1][2]; z.cz4(); f[c++] = z.getVal(); z.v0 = _f17; z.v2 = is.chars[ic][i + 1][1]; z.v3 = is.chars[ic][i + 1][6]; z.cz4(); f[c++] = z.getVal();//fv.add(li.l2i(mf.calc4(b))); } x.v0 = _f18; x.v2 = is.forms[ic][i + 1]; x.cz3(); f[c++] = x.getVal(); if (len > i + 2) { x.v0 = _f32; x.v2 = is.forms[ic][i + 2]; x.v3 = is.forms[ic][i + 1]; x.cz4(); f[c++] = x.getVal(); x.cz3(); f[c++] = x.getVal();//fv.add(li.l2i(mf.calc3(b))); } if (len > i + 3) { x.v0 = _f33; x.v2 = is.forms[ic][i + 3]; x.v3 = is.forms[ic][i + 2]; x.cz4(); f[c++] = x.getVal();//fv.add(li.l2i(mf.calc4(b))); x.cz3(); f[27] = x.getVal();//fv.add(li.l2i(mf.calc3(b))); } } // length z.v0 = _f19; z.v1 = oper; z.v2 = wl; z.cz3(); f[c++] = z.getVal();//fv.add(li.l2i(mf.calc3(dl1))); if (i < 1) { return; } x.v0 = _f27; x.v1 = oper; x.v2 = is.forms[ic][i - 1]; x.cz3(); f[c++] = x.getVal();//fv.add(li.l2i(mf.calc3(b))); if (i < 2) { return; } //added this before it was 99.46 x.v0 = _f28; x.v2 = is.forms[ic][i - 2]; x.cz3(); f[c++] = x.getVal();//fv.add(li.l2i(mf.calc3(b))); // result 99.484 if (i < 3) { return; } x.v0 = _f31; x.v1 = oper; x.v2 = is.forms[ic][i - 3]; x.v3 = is.forms[ic][i - 2]; x.cz4(); f[c++] = x.getVal();//fv.add(li.l2i(mf.calc4(b))); } // public String[] wds; /** * Write the lemma that are not mapped by operations * * @param dos */ private void writeMap(DataOutputStream dos) { try { dos.writeInt(opse.size()); for (Entry<String, String> e : opse.entrySet()) { dos.writeUTF(e.getKey()); dos.writeUTF(e.getValue()); } } catch (IOException e1) { e1.printStackTrace(); } } /** * Read the form-lemma mapping not read by operations * * @param dis */ public void readMap(DataInputStream dis) { try { int size = dis.readInt(); for (int i = 0; i < size; i++) { opse.put(dis.readUTF(), dis.readUTF()); } } catch (IOException e1) { e1.printStackTrace(); } } /* * (non-Javadoc) @see is2.tools.IPipe#write(java.io.DataOutputStream) */ @Override public void write(DataOutputStream dos) { this.writeMap(dos); try { cl.write(dos); } catch (IOException e) { e.printStackTrace(); } } }