package edu.stanford.nlp.util; import junit.framework.TestCase; import java.io.IOException; import java.nio.file.Files; import java.util.*; import java.util.regex.Pattern; public class StringUtilsTest extends TestCase { public void testTr() { assertEquals(StringUtils.tr("chris", "irs", "mop"), "chomp"); } public void testGetBaseName() { assertEquals(StringUtils.getBaseName("/u/wcmac/foo.txt"), "foo.txt"); assertEquals(StringUtils.getBaseName("/u/wcmac/foo.txt", ""), "foo.txt"); assertEquals(StringUtils.getBaseName("/u/wcmac/foo.txt", ".txt"), "foo"); assertEquals(StringUtils.getBaseName("/u/wcmac/foo.txt", ".pdf"), "foo.txt"); } public void testArgsToProperties() { Properties p1 = new Properties(); p1.setProperty("fred", "-2"); p1.setProperty("", "joe"); Properties p2 = new Properties(); p2.setProperty("fred", "true"); p2.setProperty("2", "joe"); Map<String,Integer> argNums = new HashMap<>(); argNums.put("fred", 1); assertEquals(StringUtils.argsToProperties(new String[]{"-fred", "-2", "joe"}), p2); assertEquals(StringUtils.argsToProperties(new String[]{"-fred", "-2", "joe"}, argNums), p1); } public void testValueSplit() { List<String> vals1 = StringUtils.valueSplit("arg(a,b),foo(d,e,f)", "[a-z]*(?:\\([^)]*\\))?", "\\s*,\\s*"); List<String> ans1 = Arrays.asList("arg(a,b)", "foo(d,e,f)"); assertEquals("Split failed", ans1, vals1); vals1 = StringUtils.valueSplit("arg(a,b) , foo(d,e,f) , ", "[a-z]*(?:\\([^)]*\\))?", "\\s*,\\s*"); assertEquals("Split failed", ans1, vals1); vals1 = StringUtils.valueSplit(",arg(a,b),foo(d,e,f)", "[a-z]*(?:\\([^)]*\\))?", "\\s*,\\s*"); List<String> ans2 = Arrays.asList("", "arg(a,b)", "foo(d,e,f)"); assertEquals("Split failed", ans2, vals1); List<String> vals3 = StringUtils.valueSplit("\"quoted,comma\",\"with \\\"\\\" quote\" , \"stuff\",or not,quoted,", "\"(?:[^\"\\\\]+|\\\\\")*\"|[^,\"]+", "\\s*,\\s*"); List<String> ans3 = Arrays.asList("\"quoted,comma\"", "\"with \\\"\\\" quote\"", "\"stuff\"", "or not", "quoted"); assertEquals("Split failed", ans3, vals3); } public void testLongestCommonSubstring(){ assertEquals(12,StringUtils.longestCommonSubstring("Jo3seph Smarr!", "Joseph R Smarr")); assertEquals(12,StringUtils.longestCommonSubstring("Joseph R Smarr","Jo3seph Smarr!")); } public void testEditDistance() { // test insert assertEquals(4, StringUtils.editDistance("Hi!","Hi you!")); assertEquals(5, StringUtils.editDistance("Hi!","Hi you!?")); assertEquals(1, StringUtils.editDistance("sdf", "asdf")); assertEquals(1, StringUtils.editDistance("asd", "asdf")); // test delete assertEquals(4, StringUtils.editDistance("Hi you!","Hi!")); assertEquals(5, StringUtils.editDistance("Hi you!?", "Hi!")); assertEquals(1, StringUtils.editDistance("asdf", "asd")); assertEquals(1, StringUtils.editDistance("asdf", "sdf")); // test modification assertEquals(3, StringUtils.editDistance("Hi you!","Hi Sir!")); assertEquals(5, StringUtils.editDistance("Hi you!","Hi Sir!!!")); // test transposition assertEquals(2, StringUtils.editDistance("hello", "hlelo")); assertEquals(2, StringUtils.editDistance("asdf", "adsf")); assertEquals(2, StringUtils.editDistance("asdf", "sadf")); assertEquals(2, StringUtils.editDistance("asdf", "asfd")); // test empty assertEquals(0, StringUtils.editDistance("", "")); assertEquals(3, StringUtils.editDistance("", "bar")); assertEquals(3, StringUtils.editDistance("foo", "")); } public void testSplitOnChar() { assertEquals(3, StringUtils.splitOnChar("hello\tthere\tworld", '\t').length); assertEquals(2, StringUtils.splitOnChar("hello\tworld", '\t').length); assertEquals(1, StringUtils.splitOnChar("hello", '\t').length); assertEquals("hello", StringUtils.splitOnChar("hello\tthere\tworld", '\t')[0]); assertEquals("there", StringUtils.splitOnChar("hello\tthere\tworld", '\t')[1]); assertEquals("world", StringUtils.splitOnChar("hello\tthere\tworld", '\t')[2]); assertEquals(1, StringUtils.splitOnChar("hello\tthere\tworld\n", ' ').length); assertEquals("hello\tthere\tworld\n", StringUtils.splitOnChar("hello\tthere\tworld\n", ' ')[0]); assertEquals(5, StringUtils.splitOnChar("a\tb\tc\td\te", '\t').length); assertEquals(5, StringUtils.splitOnChar("\t\t\t\t", '\t').length); assertEquals("", StringUtils.splitOnChar("\t\t\t\t", '\t')[0]); assertEquals("", StringUtils.splitOnChar("\t\t\t\t", '\t')[1]); assertEquals("", StringUtils.splitOnChar("\t\t\t\t", '\t')[4]); } /* public void testSplitOnCharSpeed() { String line = "1;2;3;4;5;678;901;234567;1"; int runs = 1000000; for (int gcIter = 0; gcIter < 10; ++gcIter) { long start = System.currentTimeMillis(); for (int i = 0; i < runs; ++i) { StringUtils.split(line, ";"); } System.err.println("Old: " + Redwood.formatTimeDifference(System.currentTimeMillis() - start) + " for " + runs + " splits"); start = System.currentTimeMillis(); for (int i = 0; i < runs; ++i) { StringUtils.splitOnChar(line, ';'); } System.err.println("New: " + Redwood.formatTimeDifference(System.currentTimeMillis() - start) + " for " + runs + " splits"); System.err.println(); } } */ public void testStringIsNullOrEmpty() { assertTrue(StringUtils.isNullOrEmpty(null)); assertTrue(StringUtils.isNullOrEmpty("")); assertFalse(StringUtils.isNullOrEmpty(" ")); assertFalse(StringUtils.isNullOrEmpty("foo")); } public void testNormalize() { assertEquals("can't", StringUtils.normalize("can't")); assertEquals("Beyonce", StringUtils.normalize("Beyoncé")); assertEquals("krouzek", StringUtils.normalize("kroužek")); assertEquals("office", StringUtils.normalize("o\uFB03ce")); assertEquals("DZ", StringUtils.normalize("DŽ")); assertEquals("1⁄4", StringUtils.normalize("¼")); assertEquals("한국어", StringUtils.normalize("한국어")); assertEquals("조선말", StringUtils.normalize("조선말")); assertEquals("が", StringUtils.normalize("が")); assertEquals("か", StringUtils.normalize("か")); } private static final char[] escapeInputs = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '"', '"', '"', }; private static final String[] csvInputs = { "", ",", "foo", "foo,bar", "foo, bar", ",foo,bar,", "foo,\"bar\"", "\"foo,foo2\"", "1997, \"Ford\" ,E350", "foo,\"\",bar", "1999,Chevy,\"Venture \"\"Extended Edition, Large\"\"\",,5000.00", "\"\"\",foo,\"", "\"\"\"\",foo", }; private static final String[][] csvOutputs = { {}, {""}, {"foo"}, {"foo", "bar"}, {"foo", " bar"}, {"", "foo", "bar"}, {"foo", "bar"}, {"foo,foo2"}, {"1997"," Ford ","E350"}, {"foo", "", "bar"}, {"1999", "Chevy", "Venture \"Extended Edition, Large\"","", "5000.00"}, {"\",foo,"}, {"\"", "foo"}, }; public void testCSV() { assertEquals("Bung test", csvInputs.length, csvOutputs.length); for (int i = 0; i < csvInputs.length; i++) { String[] answer = StringUtils.splitOnCharWithQuoting(csvInputs[i], ',', '"', escapeInputs[i]); assertTrue("Bad CSV line handling of ex " + i +": " + Arrays.toString(csvOutputs[i]) + " vs. " + Arrays.toString(answer), Arrays.equals(csvOutputs[i], answer)); } } public void testGetCharacterNgrams() { testCharacterNgram("abc", 0, 0); testCharacterNgram("abc", 1, 1, "a", "b", "c"); testCharacterNgram("abc", 2, 2, "ab", "bc"); testCharacterNgram("abc", 1, 2, "a", "b", "c", "ab", "bc"); testCharacterNgram("abc", 1, 3, "a", "b", "c", "ab", "bc", "abc"); testCharacterNgram("abc", 1, 4, "a", "b", "c", "ab", "bc", "abc"); } private void testCharacterNgram(String string, int min, int max, String... expected) { System.out.println(makeSet(expected)); System.out.println(StringUtils.getCharacterNgrams(string, min, max)); assertEquals(makeSet(expected), new HashSet<>(StringUtils.getCharacterNgrams(string, min, max))); } @SafeVarargs private final <T> Set<T> makeSet(T... elems) { return new HashSet<>(Arrays.asList(elems)); } public void testExpandEnvironmentVariables() { Map<String, String> env = new HashMap<String, String>() {{ put("A", "[outA]"); put("A_B", "[outA_B]"); put("a_B", "[outa_B]"); put("a_B45", "[outa_B45]"); put("_A", "[out_A]"); put("3A", "[out_3A]"); }}; assertEquals("xxx [outA] xxx", StringUtils.expandEnvironmentVariables("xxx $A xxx", env)); assertEquals("xxx[outA] xxx", StringUtils.expandEnvironmentVariables("xxx$A xxx", env)); assertEquals("xxx[outA]xxx", StringUtils.expandEnvironmentVariables("xxx${A}xxx", env)); assertEquals("xxx [outA_B] xxx", StringUtils.expandEnvironmentVariables("xxx $A_B xxx", env)); assertEquals("xxx [outa_B] xxx", StringUtils.expandEnvironmentVariables("xxx $a_B xxx", env)); assertEquals("xxx [outa_B45] xxx", StringUtils.expandEnvironmentVariables("xxx $a_B45 xxx", env)); assertEquals("xxx [out_A] xxx", StringUtils.expandEnvironmentVariables("xxx $_A xxx", env)); assertEquals("xxx $3A xxx", StringUtils.expandEnvironmentVariables("xxx $3A xxx", env)); assertEquals("xxx xxx", StringUtils.expandEnvironmentVariables("xxx $UNDEFINED xxx", env)); } public void testDecodeArray() throws IOException { String tempFile1 = Files.createTempFile("test", "tmp").toString(); String tempFile2 = Files.createTempFile("test", "tmp").toString(); String[] decodedArray = StringUtils.decodeArray("'"+tempFile1 + "','" + tempFile2+"'"); assertEquals(2, decodedArray.length); assertEquals(tempFile1, decodedArray[0]); assertEquals(tempFile2, decodedArray[1]); String[] test10 = { "\"C:\\Users\\BELLCH~1\\AppData\\Local\\Temp\\bill-ie5804201486895318826regex_rules.txt\"", "[\"C:\\Users\\BELLCH~1\\AppData\\Local\\Temp\\bill-ie5804201486895318826regex_rules.txt\"]" }; String[] ans10 = { "C:\\Users\\BELLCH~1\\AppData\\Local\\Temp\\bill-ie5804201486895318826regex_rules.txt" }; String[] test11 = { "C:\\Users\\BELLCH~1\\AppData\\Local\\Temp\\bill-ie5804201486895318826regex_rules.txt", "[C:\\Users\\BELLCH~1\\AppData\\Local\\Temp\\bill-ie5804201486895318826regex_rules.txt]" }; String[] ans11 = { "C:UsersBELLCH~1AppDataLocalTempbill-ie5804201486895318826regex_rules.txt" }; for (String s : test10) { assertEquals(Arrays.asList(ans10), Arrays.asList(StringUtils.decodeArray(s))); } for (String s : test11) { assertEquals(Arrays.asList(ans11), Arrays.asList(StringUtils.decodeArray(s))); } } public void testRegexGroups() { List<String> ans = Arrays.asList("42", "123", "1965"); assertEquals(ans, StringUtils.regexGroups(Pattern.compile("(\\d+)\\D*(\\d+)\\D*(\\d+)"), "abc-x42!123 -1965.")); } }