/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.experiment; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.kohsuke.args4j.Option; import edu.emory.clir.clearnlp.collection.map.ObjectIntHashMap; import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair; import edu.emory.clir.clearnlp.util.BinUtils; import edu.emory.clir.clearnlp.util.CharTokenizer; import edu.emory.clir.clearnlp.util.DSUtils; import edu.emory.clir.clearnlp.util.FileUtils; import edu.emory.clir.clearnlp.util.Joiner; import edu.emory.clir.clearnlp.util.Splitter; import edu.emory.clir.clearnlp.util.constant.CharConst; import edu.emory.clir.clearnlp.util.constant.StringConst; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class Kaist2CoNLL { @Option(name="-i", usage="input path (required)", required=true, metaVar="<filepath>") private String s_inputPath; @Option(name="-ie", usage="input file extension (default: .*)", required=false, metaVar="<regex>") private String s_inputExt = ".*"; @Option(name="-oe", usage="output file extension (required)", required=true, metaVar="<string>") private String s_outputExt; @Option(name="-ir", usage="if set, process all files under the input path recursively.", required=false, metaVar="<boolean>") private boolean b_recursive = false; @Option(name="-src", usage="encoding of source files (default: euc-kr)", required=false, metaVar="<string>") private String s_source = "euc-kr"; @Option(name="-trg", usage="encoding of target files (default: utf8)", required=false, metaVar="<string>") private String s_target = "utf8"; private final String S_REPL = "_P_"; private final String S_SLASH = "\\//sp"; private final CharTokenizer T_PLUS = new CharTokenizer(CharConst.PLUS); private final Pattern P_PLUS = Pattern.compile("\\\\\\+"); private final Pattern P_REPL = Pattern.compile(S_REPL); public Kaist2CoNLL(String[] args) { BinUtils.initArgs(args, this); try { encode(s_inputPath, s_inputExt, s_outputExt, b_recursive, s_source, s_target); } catch (IOException e) {e.printStackTrace();} } public void encode(String inputPath, String inputExtension, String outputExtension, boolean recursive, String source, String target) throws IOException { ObjectIntHashMap<String> map = new ObjectIntHashMap<>(); List<String> list = new ArrayList<>(); StringBuilder build; BufferedReader fin; String line, conll; PrintStream fout; int total = 0; for (String inputFile : FileUtils.getFileList(inputPath, inputExtension, recursive)) { fin = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), source)); build = new StringBuilder(); while ((line = fin.readLine()) != null) { line = line.trim(); if (line.isEmpty()) { conll = toCoNLL(list, map); if (conll != null) { build.append(conll); build.append(StringConst.NEW_LINE); total += list.size(); } list = new ArrayList<>(); } else list.add(line); } fin.close(); if (build.length() > 0) { fout = new PrintStream(new BufferedOutputStream(new FileOutputStream(inputFile+"."+outputExtension), 65536), false, target); fout.println(build.toString()); fout.close(); } else System.err.println("Empty file: "+inputFile); } List<ObjectIntPair<String>> ls = map.toList(); DSUtils.sortReverseOrder(ls); System.out.println("WC: "+total); for (ObjectIntPair<String> p : ls) System.out.println(p.o+" "+p.i); } private String toCoNLL(List<String> list, ObjectIntHashMap<String> gmap) { if (list.isEmpty()) return null; ObjectIntHashMap<String> map = new ObjectIntHashMap<>(); StringBuilder build = new StringBuilder(); int i, size = list.size(); String conll; for (i=0; i<size; i++) { conll = toCoNLL(map, list.get(i), i+1); if (conll == null) return null; build.append(conll); build.append(StringConst.NEW_LINE); } for (ObjectIntPair<String> p : map) gmap.add(p.o, p.i); return build.toString(); } private String toCoNLL(ObjectIntHashMap<String> map, String line, int id) { List<String> lemma = new ArrayList<>(); List<String> pos = new ArrayList<>(); String[] t = Splitter.splitTabs(line); String form, morph, m, p; int idx; if (t.length < 2) { if (line.equals(S_SLASH)) { t = new String[]{StringConst.FW_SLASH, S_SLASH}; } else { System.err.println("Incomplete: "+id+" "+line); return null; } } form = t[0]; morph = P_PLUS.matcher(t[1]).replaceAll(S_REPL); for (String s : T_PLUS.tokenize(morph)) { idx = s.lastIndexOf(CharConst.FW_SLASH); if (idx <= 0 || idx+1 >= s.length()) return null; m = P_REPL.matcher(s.substring(0, idx)).replaceAll("\\"+StringConst.PLUS); p = s.substring(idx+1); if (p.equals("eff")) p = "ef"; lemma.add(m); pos.add(p); map.add(p); } if (lemma.isEmpty()) { System.err.println("Empty: "+id+" "+line); return null; } StringBuilder build = new StringBuilder(); build.append(id); build.append(StringConst.TAB); build.append(form); build.append(StringConst.TAB); build.append(Joiner.join(lemma, StringConst.PLUS)); build.append(StringConst.TAB); build.append(Joiner.join(pos, StringConst.PLUS)); build.append(StringConst.TAB); build.append("_\t_\t_\t_\t_"); return build.toString(); } static public void main(String[] args) { new Kaist2CoNLL(args); } }