/* Copyright (C) 2010 by * * Cam-Tu Nguyen * ncamtu@ecei.tohoku.ac.jp or ncamtu@gmail.com * * Xuan-Hieu Phan * pxhieu@gmail.com * * College of Technology, Vietnamese University, Hanoi * Graduate School of Information Sciences, Tohoku University * * JVnTextPro-v.2.0 is a free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, * or (at your option) any later version. * * JVnTextPro-v.2.0 is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with JVnTextPro-v.2.0); if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ package jvnsensegmenter; import java.util.*; import java.io.*; import jvntextpro.util.StringUtils; // TODO: Auto-generated Javadoc /** * The Class FeatureGenerator. * * @author TuNC */ public class FeatureGenerator { /** * The main method. * * @param args the arguments */ public static void main(String [] args ){ if (args.length != 3){ printUsage(); System.exit(1); } boolean label = (args[0].toLowerCase().trim().equals("-lbl")); try{ String inputWhat = args[1].toLowerCase().trim(); if (inputWhat.equals("-inputfile")) { BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(args[2]), "UTF-8")); BufferedWriter out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(args[2] + ".tagged"), "UTF-8")); String text = "", line = ""; while ((line = in.readLine()) != null){ text += "\n" + line; } text = text.trim(); //text normalization text = text.replaceAll("([\t\n\r ])+", "$1"); text = text.replaceAll("[\\[\\]]", ""); text = text.replaceAll("<[^<>]*>", ""); List MarkList = new ArrayList(); ArrayList recordList = (ArrayList) doFeatureGen( new HashMap(), text , MarkList, label) ; for (int i = 0; i < recordList.size(); ++i){ out.write(recordList.get(i).toString()); out.write("\n"); } in.close(); out.close(); } else if (inputWhat.equals("-inputdir")){ BufferedWriter out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(args[2] + ".tagged"), "UTF-8")); File inputDir = new File(args[2]); File [] childrent = inputDir.listFiles(); for (int i = 0; i <childrent.length; ++i) { //go through all the file in the input file and do feagen BufferedReader in = new BufferedReader(new InputStreamReader( new FileInputStream(childrent[i]), "UTF-8")); String text = "", line = ""; while ((line = in.readLine()) != null){ text += "\n" + line; } text = text.trim(); //text normalization text = text.replaceAll("([\t\n\r ])+", "$1"); text = text.replaceAll("[\\[\\]{}]", ""); text = text.replaceAll("<[^<>]*>", ""); List MarkList = new ArrayList(); ArrayList recordList = (ArrayList) doFeatureGen( new HashMap(), text , MarkList, label) ; for (int j = 0; j < recordList.size(); ++j){ out.write(recordList.get(j).toString()); out.write("\n"); } in.close(); } out.close(); } else printUsage(); } catch (Exception e) { System.out.println("In feature generator main : " + e.getMessage()); return; } } /** * Prints the usage. */ public static void printUsage(){ System.out.println("Usage: FeatureGeneration -lbl/-unlbl -inputfile/-inputdir [input file/input dir]"); } /** * Read abbr list. * * @param dataFile the data file * @param map the map * @throws IOException Signals that an I/O exception has occurred. */ public static void readAbbrList(String dataFile, Map map) throws IOException { BufferedReader fin = new BufferedReader(new FileReader(dataFile)); String line; while ((line = fin.readLine()) != null) { StringTokenizer strTok = new StringTokenizer(line, " \t\r\n"); if (strTok.countTokens() <= 0) { continue; } String token = strTok.nextToken(); map.put(token.toLowerCase(), token.toLowerCase()); } } /** * Generate context predicates for a specified text, return string representing the context predicates. * * @param map the map * @param text the text * @param markList the mark list * @param label the label * @return the list */ public static List doFeatureGen(Map map, String text , List markList, boolean label){ markList.clear(); //Find out positions of .!? and store them in the markList int nextPos = 0; while( (nextPos = StringUtils.findFirstOf(text, ".!?", nextPos + 1)) != -1) markList.add(new Integer(nextPos)); //Generate context predicates at those positions List results = new ArrayList(); for (int i = 0; i < markList.size(); ++i){ int curPos = ((Integer) markList.get(i)).intValue(); String record = genCPs(map, text, curPos); //Assign label to feature string if it is specified if (label){ int idx = StringUtils.findFirstNotOf(text, " \t", curPos + 1); if (idx == -1 || (text.charAt(idx) == '\n')){ //end of sentence record += " " + "y"; } else record += " " + "n"; } results.add(record); } return results; } /** * get context predicates at a specified position in the sequence. * * @param map the map * @param text the text * @param position the position * @return the string */ private static String genCPs(Map map, String text, int position){ //get the current token(containing this mark) and its suffix & prefix String token = "", suffix = "", prefix = ""; int idx1 = -1, idx2 = -1, idx; idx1 = StringUtils.findLastOf(text, " \t\n\r", position); if (idx1 == -1) idx1 = 0; idx2 = StringUtils.findFirstOf(text, " \t\n\r", position + 1); if (idx2 == -1) idx2 = text.length(); token = text.substring(idx1 + 1, idx2); if (position + 1 < idx2) suffix = text.substring(position + 1, idx2).trim(); if (idx1 + 1 < position) prefix = text.substring(idx1 + 1, position).trim(); //get the previous token idx = idx2; // save idx2 for get preToken later //get the previous token String preToken = ""; if (idx1 != 0 ){ idx2 = StringUtils.findLastNotOf(text, " \t\n\r", idx1); idx1 = StringUtils.findLastOf(text, " \t\n\r", idx2); if (idx1 == -1) idx1 = 0; if (idx2 != -1) preToken = text.substring(idx1, idx2 + 1).trim(); } //get the next token String nexToken = ""; idx2 = idx; if (idx2 != text.length()){ idx1 = StringUtils.findFirstNotOf(text, " \t\n\r", idx2 + 1); idx2 = StringUtils.findFirstOf(text, " \t\n\r", idx1); if (idx2 == -1) idx2 = text.length(); if (idx1 != -1) nexToken = text.substring(idx1, idx2).trim(); } //generating context predicates String cps = ""; // 01:tok= cps += " 01=" + token; // 02:tok-lower cps += " 02=" + token.toLowerCase(); if (StringUtils.isFirstCap(token)) { // 03:tok-first-cap cps += " 03"; } if (map.containsKey(token.toLowerCase())) { // 04:tok-in-abbrlist cps += " 04"; } if (StringUtils.containNumber(token)) { // 05:tok-has-num cps += " 05"; } if (StringUtils.containLetter(token)) { // 06:tok-has-let cps += " 06"; } if (StringUtils.containLetterAndDigit(token)) { // 07:tok-has-let-num cps += " 07"; } if (StringUtils.isAllNumber(token)) { // 08:tok-is-all-num cps += " 08"; } // 09:tok-countstop cps += " 09=" + Integer.toString(StringUtils.countStops(token)); // 10:tok-countsign cps += " 10=" + Integer.toString(StringUtils.countPuncs(token)); // 11:tok-pre cps += " 11=" + prefix; // 12:tok-pre-lower cps += " 12=" + prefix.toLowerCase(); if (StringUtils.isFirstCap(prefix)) { // 13:tok-pre-first-cap cps += " 13"; } // 14:tok-suf cps += " 14=" + suffix; // 15:tok-suf-lower cps += " 15=" + suffix.toLowerCase(); if (StringUtils.isFirstCap(suffix)) { // 16:tok-suf-first-cap cps += " 16"; } if (preToken != "") { // 17:pre-tok cps += " 17=" + preToken; // 18:pre-tok-lower cps += " 18=" + preToken.toLowerCase(); if (StringUtils.isFirstCap(preToken)) { // 19:pre-tok-first-cap cps += " 19"; } if (map.containsKey(preToken.toLowerCase())) { // 20:pre-tok-in-abbrlist cps += " 20"; } if (StringUtils.containNumber(preToken)) { // 21:pre-tok-has-num cps += " 21"; } if (StringUtils.containLetter(preToken)) { // 22:pre-tok-has-let cps += " 22"; } if (StringUtils.containLetterAndDigit(preToken)) { // 23:pre-tok-has-let-num cps += " 23"; } if (StringUtils.isAllNumber(preToken)) { // 24:pre-tok-is-allnum cps += " 24"; } // 25:pre-tok-countstop cps += " 25=" + Integer.toString(StringUtils.countStops(preToken)); // 26:pre-tok-countsign cps += " 26=" + Integer.toString(StringUtils.countPuncs(preToken)); } else { // 27:pre-tok cps += " 27=null"; } if (nexToken != "") { // 28:nex-tok cps += " 28=" + nexToken; // 29:nex-tok-lower cps += " 29=" + nexToken.toLowerCase(); if (StringUtils.isFirstCap(nexToken)) { // 30:nex-tok-first-cap cps += " 30"; } if (map.containsKey(nexToken.toLowerCase())) { // 31:nex-tok-in-abbrlist cps += " 31"; } if (nexToken.startsWith("\"") || nexToken.startsWith("''") || nexToken.startsWith("``") || nexToken.startsWith("'") || nexToken.startsWith("`")) { cps += " 39"; } if (StringUtils.isFirstCap(nexToken)) { cps += " 40"; } if (StringUtils.containNumber(nexToken)) { // 32:nex-tok-has-num cps += " 32"; } if (StringUtils.containLetter(nexToken)) { // 33:nex-tok-has-let cps += " 33"; } if (StringUtils.containLetterAndDigit(nexToken)) { // 34:nex-tok-has-let-num cps += " 34"; } if (StringUtils.isAllNumber(nexToken)) { // 35:nex-tok-is-allnum cps += " 35"; } // 36:nex-tok-countstop cps += " 36=" + Integer.toString(StringUtils.countStops(nexToken)); // 37:nex-tok-countsign cps += " 37=" + Integer.toString(StringUtils.countPuncs(nexToken)); } else { // 38:nex-tok cps += " 38=null"; } //extra context predicates for Vietnamese sensegment //39:tok-has-@ if (token.contains("@")) cps += " 39"; //40:len-of-prefix cps += " 40=" + prefix.length(); //41:len-of-suffix cps += " 41=" + suffix.length(); //42:tok-has-slash if (token.contains("/")) cps += " 42"; //43:nex-tok-first_char if (nexToken != "") cps += " 43=" + nexToken.charAt(0); return cps.trim(); } }