package edu.cmu.minorthird.util; import java.io.*; import java.util.regex.*; import edu.cmu.minorthird.text.*; /** * Line processing utilities. * Matcher for regular expressions, * adding features to stringBuffer in svmformat, etc * * @author Vitor R. Carvalho (vitor [at] cs..cmu...) */ public class LineProcessingUtil { /** Returns true if substring in input (or part of it) matches the pattern. * @param patternStr regexp (in String format) * @param tmpstr line to be matched to regexp (in String format) * @return true (if pattern is matched) or false (otherwise) * * */ public static boolean lineMatcher(String patternStr, String tmpstr) { int strsize = tmpstr.length(); CharSequence tmp = tmpstr.subSequence(0, strsize); Pattern pattern = Pattern.compile(patternStr); Matcher matcher = pattern.matcher(tmp); if (matcher.find()) { return true; } return false; } /** * If the line substring matches the regexp, * it adds a " featurename=1" to the string buffer * * It is useful for producing external datasets in Minorthird format * * @param line in String format * @param regexp in String format * @param featureName feature name to be added, in case the regexp matches the line substring * @param features_out StringBuffer to which the feature should be added * **/ public static void addFeature(String line, String regexp, String featureName, StringBuffer features_out) { if (lineMatcher(regexp, line)){ features_out.append(" "+featureName+"=1"); } } /** * Returns the percentage of punctuation (\p{punct}) characters in a line * * @param line in String format * @return a double with the percentage of characters **/ public static double punctuationPercentage(String line) { int linelength = line.length(); if(linelength==0) return 0; int punctCount = 0; for (int i=0; i<linelength; i++){ if(lineMatcher("\\p{Punct}",line.substring(i,i+1))){ punctCount++; } } double perc = punctCount/(double)linelength; return perc; } /** * Returns the percentage of A-Z or a-z characters in a line * * @param line in String format * @return the percentage of [a-z] or [A-Z] characters in the line **/ public static double AtoZPercentage(String line) { int linelength = line.length(); if(linelength==0) return 0; int punctCount = 0; for (int i=0; i<linelength; i++){ if(lineMatcher("a-zA-Z",line.substring(i,i+1))){ punctCount++; } } double perc = punctCount/(double)linelength; return perc; } /** * Returns the percentage characters [\w] in a line * * @param line in String format * @return the percentage of "\w" characters in the line **/ public static double wordCharactersPercentage(String line) { int linelength = line.length(); if(linelength==0) return 0; int punctCount = 0; for (int i=0; i<linelength; i++){ if(lineMatcher("\\w",line.substring(i,i+1))){ punctCount++; } } double perc = punctCount/(double)linelength; return perc; } /** * returns the percentage of tabs in a line * * @param line in String format * @return the percentage of "\t" characters in the line **/ public static double indentPercentage(String line) { int linelength = line.length(); if(linelength==0) return 0; int punctCount = 0; for (int i=0; i<linelength; i++){ if(lineMatcher("\\t",line.substring(i,i+1))){ punctCount++; } } double perc = punctCount/(double)linelength; return perc; } /* * Returns the number of indentations or tabs ("\t") in a line * * @param line in String format * @return the number of "\t" characters in the line * **/ public static int indentNumber(String line) { int linelength = line.length(); if(linelength==0) return 0; int punctCount = 0; for (int i=0; i<linelength; i++){ if(lineMatcher("\\t",line.substring(i,i+1))){ punctCount++; } } return punctCount; } /* * Returns the number of times a certain expression happened in a line * * @param - the expression to be counted (for instance: "Would you") * @param line in String format * @return the number of times the expression happened in the line * **/ public static int numberOfMatches(String expression, String line) { int linelength = line.length(); int exprelength = expression.length(); if ((linelength == 0)||(exprelength==0)) return 0; int theCount = 0; for (int i = 0; i < (linelength - exprelength); i++) { if (lineMatcher(expression, line.substring(i, i + exprelength))) { theCount++; i +=exprelength; } } //System.out.println("count = "+theCount); return theCount; } /** * detect a sequence of 2 lines starting with the same * punctuation (\p{Punct}) character * * @param tmp line1 in String format * @param tmp1 line2 in String format * @return true, if both lines start with same punctuation symbol * */ public static boolean startWithSameInitialPunctCharacters(String tmp, String tmp1){ if((tmp.length()>0)&&(tmp1.length()>0)){ String ind = tmp.substring(0,1);//get first character if (LineProcessingUtil.lineMatcher("\\p{Punct}",ind)){ String ind2 = tmp1.substring(0,1); if(ind2.compareTo(ind)==0) { return true; } } } return false; } /** * Method to split a message (string format) into lines * @param tmp message as String * @return message lines in a String[] */ public static String[] getMessageLines(String tmp){ String[] outL = tmp.split("\n"); return outL; } /** * Method to read a file and turn it into a string - based on rcwang's code * * @param in String with the name of file * @return the original fine in a String format * */ public static String readFile(String in) throws IOException { String line = null; StringBuffer content = new StringBuffer(""); BufferedReader bReader = new BufferedReader(new FileReader(in)); while ((line = bReader.readLine()) != null) { content.append(line + "\n"); } bReader.close(); return content.toString(); //return the contents of the file in a string format } /** Writes the contents of a String Buffer to an output file * * @param outputFileName output File name (as a String) * @param aux string buffer to be written to output file */ public static void writeToOutputFile(String outputFileName, StringBuffer aux) throws IOException { BufferedWriter bWriter = new BufferedWriter(new FileWriter(outputFileName)); bWriter.write(aux.toString()); bWriter.close(); } //don't use this public static TextLabels readBsh(File dir, File envfile) throws Exception { System.out.println("reading data files"); TextBaseLoader tbl = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE, true); tbl.load(dir); TextLabels lala = tbl.getLabels(); TextBase basevitor = lala.getTextBase(); TextLabelsLoader labelLoaderVitor = new TextLabelsLoader(); System.out.println("reading env file..."); labelLoaderVitor.importOps((MutableTextLabels)lala, basevitor, envfile); return lala; } }