/*******************************************************************************
* Copyright (c) 2011 Wind River Systems, Inc. and others. All rights reserved.
* This program and the accompanying materials are made available under the terms
* of the Eclipse Public License v1.0 which accompanies this distribution, and is
* available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Wind River Systems - initial API and implementation
*******************************************************************************/
package org.eclipse.tcf.te.core.utils.text;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.eclipse.core.runtime.Assert;
/**
* A class providing useful static method to manipulate strings.
*/
public final class StringUtil {
/**
* Tokenize a list of whitespace-separated arguments into a list.
* <p>
* <b>Note:</b> <i>This method has been designed for the specific needs of tokenizing
* command line arguments to launch external application from Java!</i>
* <p>
* Arguments may be quoted by double quotes. Quotes must not appear
* inside words, or they will lead to new tokens.
* <p>
* Example:<pre><code> a"bc"d --> tokenized into a,bc,d</code></pre>
* <p>
* If <code>maxArgs</code> is greater than 0, then a maximum of <code>maxArgs</code>
* tokens is returned and the resulting array is filled to the given number of arguments
* with empty strings.
* <p>
* If <code>maxArgs</code> is less than or equal 0, the original number of arguments is tokenized.
*
* @param arguments The space separated arguments string. Must not be <code>null</code>.
* @param maxArgs The maximum number of returned tokens or <code>0</code>.
* @param keepQuotes If <code>true</code>, the original arguments quotes are retained, <code>false</code> otherwise.
*
* @return The tokenized string or an empty list.
*/
public static String[] tokenize(String arguments, int maxArgs, boolean keepQuotes) {
Assert.isNotNull(arguments);
// Create the result list
List<String> result = maxArgs > 0 ? new ArrayList<String>(maxArgs) : new ArrayList<String>();
// Arguments sent separately
StreamTokenizer tok = new StreamTokenizer(new StringReader(arguments));
tok.resetSyntax();
// whitespace is everything from 0 to 32 (space)
tok.whitespaceChars(0, 32);
// everything from 33 to 255 is treated as word character
tok.wordChars(33, 255);
// except the 0xa0, is an whitespace too.
tok.whitespaceChars(0xa0, 0xa0);
// the quoting character is the double-quote
tok.quoteChar('"');
// extract only the number of arguments request or unlimited if maxArgs == 0
int nArgs = 0;
while (maxArgs <= 0 || nArgs < maxArgs) {
try {
// get the next token from the stream
int ttype = tok.nextToken();
// if reached end of file, leave the loop
if (ttype == StreamTokenizer.TT_EOF) {
break;
}
if (keepQuotes && ttype == 34) { //quoted word
String quoted = enQuote(tok.sval);
if (quoted.length() < 2 || quoted.charAt(0) != '"' ||
quoted.charAt(quoted.length() - 1) != '"') {
quoted = '"' + quoted + '"';
}
result.add(quoted);
}
else {
result.add(tok.sval);
}
nArgs++;
}
catch (IOException e) {
// on any IO exception, break the loop
break;
}
}
return result.toArray(new String[result.size()]);
}
private final static Pattern ALLOWED_STRING_PATTERN = Pattern.compile("[a-zA-Z0-9_@.-]*"); //$NON-NLS-1$
/**
* Enquote the given string if necessary by putting double quotes around it.
* <p>
* <b>Note:</b> <i>This method has been designed for the specific needs of enquoting
* command line arguments to launch external application from Java!</i>
* <p>
* Characters that require quoting are:
* <dl>
* <dt>/\:;</dt><dd>Because they are file or path separators respectively</dd>
* <dt>\s</dt><dd>All whitespace characters, naturally</dd>
* <dt>|<>{}()$^~</dt><dd>Because they have shell or TCL special meaning</dd>
* <dt>*%?&</dt><dd>Because they are commonly used as wildcard's</dd>
* <dt>#</dt><dd>Because it is commonly used to start comments</dd>
* <dt>'"`</dt><dd>Because they are quoting characters</dd>
* </dl>
* To make a long story short, we only allow ASCII characters, numbers,
* the underscore and <code>@.-</code> to go unquoted.
*
* @param unqouted The string to quote or <code>null</code>.
*
* @return The quoted string or "\"\"".
*/
public static String enQuote(String unqouted) {
if (unqouted == null) {
return "\"\""; //$NON-NLS-1$
}
else if (unqouted.length() == 0) {
return "\"\""; //$NON-NLS-1$
}
else if (ALLOWED_STRING_PATTERN.matcher(unqouted).matches()) {
return unqouted;
}
else {
StringReader r = new StringReader(unqouted);
StringBuffer buf = new StringBuffer(unqouted.length() + 16);
boolean containsWhitespaces =
Pattern.compile("\\s").matcher(unqouted).find(); //$NON-NLS-1$
if (containsWhitespaces) {
buf.append('\"');
}
try {
int c = r.read();
while (c >= 0) {
switch (c) {
// case '\\':
case '\'':
case '\"':
buf.append('\\');
buf.append((char)c);
break;
case '\b':
buf.append("\\b");break; //$NON-NLS-1$
case '\f':
buf.append("\\f");break; //$NON-NLS-1$
case '\n':
buf.append("\\n");break; //$NON-NLS-1$
case '\r':
buf.append("\\r");break; //$NON-NLS-1$
case '\t':
buf.append("\\t");break; //$NON-NLS-1$
default:
if (c > 0xff) { // Unicode
buf.append('\\');
buf.append('u');
String hexString = Integer.toHexString(c);
if (hexString.length() < 4) {
buf.append('0');
}
buf.append(hexString);
}
else if (c < 0x20 || c > '~') {
// octal escape sequence
buf.append('\\');
buf.append(Integer.toOctalString(c));
}
else {
buf.append((char)c);
}
}
c = r.read();
}
}
catch (IOException e) { /* ignore */
}
r.close();
if (containsWhitespaces) {
buf.append('\"');
}
return buf.toString();
}
}
}