InputStreamTokenizer.java example

Explorer

delcyon-capo-master
- java
  - com
    - delcyon
      - capo
        CapoApplication.java
        CapoThreadFactory.java
        Configuration.java
        ContextThread.java
        InterruptibleRunnable.java
        annotations
        ControlNamespaceURI.java
        DefaultDocumentProvider.java
        DirectoyProvider.java
        XmlMappedArrays.java
        client
        CapoClient.java
        controller
        AbstractClientSideControl.java
        AbstractControl.java
        ControlElement.java
        ControlElementProvider.java
        Group.java
        LocalRequestProcessor.java
        client
        ClientSideControl.java
        ControllerRequest.java
        ServerControllerResponse.java
        elements
        AppendElement.java
        CallElement.java
        ChooseElement.java
        CommandElement.java
        CreateElement.java
        DebugElement.java
        DiffElement.java
        ExportElement.java
        GroupElement.java
        ImportElement.java
        InsertBeforeElement.java
        LogElement.java
        OpenElement.java
        OtherwiseElement.java
        ParseElement.java
        RemoteGroupElement.java
        RemoteGroupMessage.java
        RemoveElement.java
        RepeatElement.java
        ReplaceElement.java
        RequestElement.java
        ResourceControlElement.java
        ResourceMetaDataElement.java
        ResourceMonitorElement.java
        RestartElement.java
        SetAttributeElement.java
        SetIDElement.java
        SnapshotElement.java
        StepElement.java
        SyncElement.java
        TaskElement.java
        TransformElement.java
        UpdateElement.java
        VarElement.java
        WhenElement.java
        server
        ClientControllerRequest.java
        ControllerClientRequestProcessor.java
        ControllerProcessingException.java
        ControllerResponse.java
        ServerSideControl.java
        crypto
        CertificateRequest.java
        CertificateRequestProcessor.java
        datastream
        AccessibleByteArrayOutputStream.java
        BufferedSocket.java
        ConsoleOutputStreamFilter.java
        NullOutputStream.java
        OutputStreamAttributeFilterProvider.java
        RegexFilterOutputStream.java
        SocketFinalizer.java
        StreamEventFilterInputStream.java
        StreamEventFilterOutputStream.java
        StreamEventListener.java
        StreamFinalizer.java
        StreamHandler.java
        StreamProcessor.java
        StreamProcessorProvider.java
        StreamUtil.java
        TriggerFilterOutputStream.java
        stream_attribute_filter
        AbstractFilterInputStream.java
        ContentFormatTypeFilterInputStream.java
        ContentFormatTypeFilterOutputStream.java
        InputStreamAttributeFilterProvider.java
        MD5FilterInputStream.java
        MD5FilterOutputStream.java
        MimeTypeFilterInputStream.java
        SizeFilterInputStream.java
        SizeFilterOutputStream.java
        StreamAttributeFilter.java
        exceptions
        MissingAttributeException.java
        http
        HTTPStreamConsumer.java
        SimpleHttpRequest.java
        SimpleHttpResponse.java
        modules
        ModuleProvider.java
        ModuleRequest.java
        ModuleRequestProcessor.java
        parsers
        GrammarParser.java
        ParseNode.java
        ParseRule.java
        ParseTape.java
        ParseToken.java
        ParseTree.java
        Tokenizer.java
        preferences
        Preference.java
        PreferenceInfo.java
        PreferenceInfoHelper.java
        PreferenceProvider.java
        protocol
        client
        CapoConnection.java
        Request.java
        XMLRequest.java
        XMLServerResponse.java
        XMLServerResponseProcessor.java
        XMLServerResponseProcessorProvider.java
        server
        AbstractClientRequestProcessor.java
        AbstractResponse.java
        ClientRequest.java
        ClientRequestProcessor.java
        ClientRequestProcessorProvider.java
        ClientRequestProcessorSession.java
        ClientRequestProcessorSessionManager.java
        ClientRequestXMLProcessor.java
        Response.java
        XMLResponse.java
        resourcemanager
        CapoDataManager.java
        ContentFormatType.java
        ErrorResourceDescriptor.java
        ResourceDescriptor.java
        ResourceListener.java
        ResourceManager.java
        ResourceParameter.java
        ResourceParameterBuilder.java
        ResourceType.java
        ResourceTypeProvider.java
        ResourceURI.java
        remote
        RemoteResourceDescriptorMessage.java
        RemoteResourceDescriptorProxy.java
        RemoteResourceRequest.java
        RemoteResourceResponse.java
        RemoteResourceResponseProcessor.java
        RemoteResourceType.java
        types
        AbstractContentMetaData.java
        AbstractResourceDescriptor.java
        AbstractResourceType.java
        ClientsResourceDescriptor.java
        ClientsResourceType.java
        ContentMetaData.java
        FileResourceContentMetaData.java
        FileResourceDescriptor.java
        FileResourceType.java
        HttpResourceDescriptor.java
        HttpResourceType.java
        JcrContentMetaData.java
        JcrResourceDescriptor.java
        JcrResourceType.java
        JcrVersionContentMetaData.java
        JdbcResourceDescriptor.java
        JdbcResourceType.java
        RefResourceDescriptor.java
        RefResourceType.java
        ShellResourceDescriptor.java
        ShellResourceType.java
        SimpleContentMetaData.java
        StateParameters.java
        Versionable.java
        server
        CapoServer.java
        jackrabbit
        CapoJcrServer.java
        jetty
        CapoJettyServer.java
        tasks
        TaskManagerDocumentUpdaterThread.java
        TaskManagerThread.java
        util
        CloneControl.java
        CommandExecution.java
        ControlledClone.java
        EqualityProcessor.java
        HexUtil.java
        InternHashMap.java
        LeveledConsoleHandler.java
        LogPrefixFormatter.java
        MarshalWrapper.java
        MarshalWrapperInterface.java
        NamespaceContextMap.java
        ReflectionUtility.java
        StacktraceElementMarshalWrapper.java
        ToStringControl.java
        VariableContainerWrapper.java
        XMLAttribute.java
        XMLElement.java
        XMLSerializer.java
        diff
        Diff.java
        DiffDataConsumer.java
        DiffDataProvider.java
        DiffEntry.java
        InputStreamTokenizer.java
        Window.java
        WindowItem.java
        WindowItemLink.java
        XMLTextDiff.java
        webapp
        models
        DomItemModel.java
        ResourceDescriptorItemModel.java
        WContentMetaDataItemModel.java
        servlets
        CapoWebApplication.java
        CapoWebWTServlet.java
        resource
        AbstractResourceServlet.java
        DefaultResourceStreamer.java
        ResourceStreamer.java
        WResourceDescriptor.java
        WebResourcesServlet.java
        widgets
        CapoWTreeView.java
        WAceEditor.java
        WBoundedContainerWidget.java
        WCSSItemDelegate.java
        WCapoResourceEditor.java
        WCapoResourceExplorer.java
        WCapoResourceTreeView.java
        WCapoSearchControl.java
        WCapoXmlTreeView.java
        WConsoleWidget.java
        WCursorState.java
        WDiffWidget.java
        WLoginControl.java
        WResourceFactory.java
        WTailFileWidget.java
        WValidatorFactory.java
        WWindowAnchor.java
        WWorker.java
        WXMLEditor.java
        WXmlElementEditor.java
        WXmlNavigationBar.java
        xml
        CapoXPathFunction.java
        CapoXPathFunctionResolver.java
        XMLDiff.java
        XMLProcessor.java
        XMLProcessorProvider.java
        XMLStreamProcessor.java
        XPath.java
        XPathFunctionProcessor.java
        XPathFunctionProvider.java
        XPathFunctionUtility.java
        cdom
        CAttr.java
        CComment.java
        CDOMEvent.java
        CDOMEventListener.java
        CDOMHandler.java
        CDOMImplementation.java
        CDocument.java
        CDocumentBuilder.java
        CDocumentBuilderFactory.java
        CDocumentType.java
        CElement.java
        CNamedNodeMap.java
        CNode.java
        CNodeDefinition.java
        CNodeList.java
        CNodeValidator.java
        CNodeValidator2.java
        CProcessingInstruction.java
        CText.java
        CValidationException.java
        NodeProcessor.java
        NodeValidationUtilitesFI.java
        OccurancePredicate.java
        VariableContainer.java
        VariableProcessor.java
        dom
        ResourceAttr.java
        ResourceDeclarationElement.java
        ResourceDocument.java
        ResourceDocumentBuilder.java
        ResourceElement.java
        ResourceElementResourceDescriptor.java
        ResourceElementResourceType.java
        ResourceNode.java
        ResourceText.java
  - eu
    - medsea
      - mimeutil
        MimeException.java
        MimeType.java
        MimeTypeHashSet.java
        MimeUtil.java
        MimeUtil2.java
        TextMimeDetector.java
        TextMimeType.java
        detector
        ExtensionMimeDetector.java
        InvalidMagicMimeEntryException.java
        MagicMimeEntry.java
        MagicMimeEntryOperation.java
        MagicMimeMimeDetector.java
        MatchingMagicMimeEntry.java
        MimeDetector.java
        OpendesktopMimeDetector.java
        WindowsRegistryMimeDetector.java
        handler
        TextMimeHandler.java
      - util
        EncodingGuesser.java
        StringUtil.java
        ZipJarUtil.java
- tests
  - com
    - delcyon
      - capo
        ProblemTests.java
        controller
        elements
        GroupElementTest.java
        ImportElementTest.java
        ParserElementTest.java
        ResourceElementTest.java
        RestartElementTest.java
        SnapshotElementTest.java
        SyncElementTest.java
        TaskElementTest.java
        crypto
        CertificateRequestProcessorTest.java
        datastream
        RegexFilterOutputStreamTest.java
        parsers
        GrammarParserTest.java
        TokenizerTest.java
        resourcemanager
        ResourceDescriptorTest.java
        ResourceURITest.java
        types
        ClientsResourceDescriptorTest.java
        FileResourceDescriptorTest.java
        HttpResourceDescriptorTest.java
        JcrResourceDescriptorTest.java
        JdbcResourceDescriptorTest.java
        RefResourceDescriptorTest.java
        ShellResourceDescriptorTest.java
        server
        CapoServerTest.java
        tests
        util
        ExternalTestClient.java
        ExternalTestServer.java
        TestCapoApplication.java
        TestClient.java
        TestServer.java
        Util.java
        external
        Util.java
        util
        TestInterface.java
        XMLSerializerTest.java
        XMLSerializerTestData.java
        diff
        DiffTest.java
        XMLDiffTest.java
        xml
        cdom
        CDocumentTest.java
        dom
        ResourceDocumentTest.java
        xsd
        SchemaDocumentTest.java

/**
Copyright (c) 2011 Delcyon, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package com.delcyon.capo.util.diff;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

/**
 * This will break up an InputStream according to a matching set of token patterns. This should be usable for binary streams, but was originally written to handle text based streams.
 * The general contract is that you should create an InputStreamTokenizer with a custom ArrayList<ArrayList<Integer>> of match chars, or use one of the predefined TokenLists.
 * The maximum length of each list of integers is 8. We use bit shifting for quick comparisons, and it is based on longs, which will start sliding off the end if we bit shift our bytes by more than 64 places, 8 for each byte.
 * You cannot construct this class with the CUSTOM TokenList. It is set automatically if you create an instance with your own lists.
 * Once you have an instance, call readBytes() to get the next set of bytes from your stream that ends in a particular set of tokens (these will be included in the data you receive).
 * End of stream is a little tricky: any trailing data that is not terminated by a token is returned by the final non-empty readBytes() call.
 * This class does NOT close the InputStream.  
 * @author jeremiah
 *
 */
public class InputStreamTokenizer
{
	/** Maximum number of bytes in a single token: eight 8-bit values fill one 64-bit long. */
	private static final int MAX_TOKEN_LENGTH = 8;
	
	/**
	 * Predefined sets of terminating tokens. CUSTOM is only a marker that is assigned
	 * automatically when a tokenizer is created from caller supplied token lists.
	 */
	public enum TokenList
	{
		NEW_LINE(new char[][]{{'\n'},{'\r'},{'\r','\n'}}),
		WORD_BOUNDRY(new char[][]{{'\n'},{'\r'},{'\r','\n'},{'\t'},{' '}}),
		CUSTOM(new int[0][0]);
		
		private ArrayList<ArrayList<Integer>> tokenLists = null;
		
		private TokenList(char[][] tokensArray)
		{
			tokenLists = convertArrayIntoTokenLists(tokensArray);			
		}
		
		
		private TokenList(int[][] tokensArray)
		{
			tokenLists = convertArrayIntoTokenLists(tokensArray);
		}
		
		/**
		 * @return the token lists of this constant, or null for CUSTOM (it is built from an empty array).
		 * The returned list is the internal instance, not a copy.
		 */
		public ArrayList<ArrayList<Integer>> getTokenLists()
		{
			return tokenLists;
		}
		
	}
	
	/**
	 * Utility to convert a 2d array of ints into a 2d ArrayList of Integers 
	 * @param tokenListsArray one inner array per token, each element being one byte value of that token
	 * @return the converted lists, or null if the outer array is empty
	 */
	public static ArrayList<ArrayList<Integer>> convertArrayIntoTokenLists(int[][] tokenListsArray)
	{
		ArrayList<ArrayList<Integer>> tokenLists = null;
		if (tokenListsArray.length != 0)
		{
			tokenLists = new ArrayList<ArrayList<Integer>>();
			for (int[] intTokenArray : tokenListsArray)
			{
				ArrayList<Integer> tokenArray = new ArrayList<Integer>();
				tokenLists.add(tokenArray);
				for (int i : intTokenArray)
				{
					tokenArray.add(i);
				}
			}
		}
		
		return tokenLists;
	}
	
	/**
	 * Utility to convert a 2d array of chars into a 2d ArrayList of Integers 
	 * @param tokenListsArray one inner array per token, each element being one char of that token
	 * @return the converted lists, or null if the outer array is empty
	 */
	public static ArrayList<ArrayList<Integer>> convertArrayIntoTokenLists(char[][] tokenListsArray)
	{
		ArrayList<ArrayList<Integer>> tokenLists = null;
		if (tokenListsArray.length != 0)
		{
			tokenLists = new ArrayList<ArrayList<Integer>>();
			for (char[] charTokenArray : tokenListsArray)
			{
				ArrayList<Integer> tokenArray = new ArrayList<Integer>();
				tokenLists.add(tokenArray);
				for (int c : charTokenArray)
				{
					tokenArray.add((int)c);
				}
			}
		}
		
		return tokenLists;
	}
	
	
	private InputStream inputStream;	
	private int longestArraySize = 0;
	//last byte read from the stream that has not been handed out yet, or -1 if there is none (or EOF was hit)
	private int value = -1;
	private ByteArrayOutputStream buffer = new ByteArrayOutputStream();
	//matches[token][n] holds the first n+1 bytes of that token packed into a long (first byte in the highest used position), 0 if the token is shorter
	private long[][] matches = null;
	private TokenList tokenList;
	private ArrayList<ArrayList<Integer>> tokenLists;
	
	/** 
	 * @param inputStream stream to break into chunks, it is never closed by this class
	 * @param tokenList (You cannot use CUSTOM here!) If you want a custom list, use the other constructor, and CUSTOM will be set for you.
	 * @throws Exception if tokenList is CUSTOM
	 */
	public InputStreamTokenizer(InputStream inputStream, TokenList tokenList) throws Exception
	{
		if (tokenList == TokenList.CUSTOM)
		{
			throw new Exception("InputStreamTokenizer cannot be created with CUSTOM tokenList. ");
		}
		else
		{
			this.tokenList = tokenList;
			init(inputStream,tokenList.getTokenLists());
		}
	}
	
	/**
	 * Effectively wraps a byte array in a stream using ByteArrayInputStream.
	 * Convenience method
	 * @param data bytes to tokenize
	 * @param tokenList (You cannot use CUSTOM here!)
	 * @throws Exception if tokenList is CUSTOM
	 */
	public InputStreamTokenizer(byte[] data, TokenList tokenList) throws Exception
	{
		if (tokenList == TokenList.CUSTOM)
		{
			throw new Exception("InputStreamTokenizer cannot be created with CUSTOM tokenList. ");
		}
		else
		{
			this.tokenList = tokenList;
			init(new ByteArrayInputStream(data),tokenList.getTokenLists());
		}
	}
	
	/**
	 * 
	 * @param inputStream stream to break into 'lines'
	 * @param tokenLists byte sequences represented by integers that signify a line break. The classic would be {'\r','\n'} as well as individual '\r' and '\n'.
	 * Because our matching algorithm packs the bytes of a token into a long, the maximum length of a line break is 8 bytes.
	 * Every value is expected to be a single byte value (0-255); this is not validated. A value of 0 in the first position can never match.
	 * Every prefix of a token is treated as a terminator of its own, so a partial match followed by a non matching byte ends the chunk.
	 * @throws NullPointerException if tokenLists is null
	 * @throws IllegalArgumentException if any token is longer than 8 bytes 
	 */
	public InputStreamTokenizer(InputStream inputStream, ArrayList<ArrayList<Integer>> tokenLists)
	{
		this.tokenList = TokenList.CUSTOM;
		init(inputStream,tokenLists);
	}

	/**
	 * Effectively wraps a byte array in a stream using ByteArrayInputStream.
	 * Convenience method
	 * @param data bytes to tokenize
	 * @param tokenLists see {@link #InputStreamTokenizer(InputStream, ArrayList)}
	 * @throws NullPointerException if tokenLists is null
	 * @throws IllegalArgumentException if any token is longer than 8 bytes
	 */
	public InputStreamTokenizer(byte[] data, ArrayList<ArrayList<Integer>> tokenLists)
	{
		this.tokenList = TokenList.CUSTOM;
		init(new ByteArrayInputStream(data),tokenLists);
	}
	
	/**
	 * @return the predefined TokenList this instance was created with, or CUSTOM if it was created from caller supplied lists
	 */
	public TokenList getTokenList()
	{
		return tokenList;
	}
	
	/**
	 * @return the token lists in use (the instance handed to the constructor, not a copy)
	 */
	public ArrayList<ArrayList<Integer>> getTokenLists()
	{
		return tokenLists;
	}
	
	/**
	 * Stores the stream and precomputes the packed match table used by readBytes().
	 */
	private void init(InputStream inputStream, ArrayList<ArrayList<Integer>> tokenLists)
	{
		if (tokenLists == null)
		{
			throw new NullPointerException("tokenLists must not be null");
		}
		this.inputStream = inputStream;
		this.tokenLists = tokenLists;
		//look through all of our line breaks and find the maximum size for the read ahead limit 
		for (ArrayList<Integer> arrayList : tokenLists)
		{
			if (arrayList.size() > MAX_TOKEN_LENGTH)
			{
				//a 9th byte would shift the first one out of the long and produce false matches
				throw new IllegalArgumentException("token length "+arrayList.size()+" exceeds the maximum of "+MAX_TOKEN_LENGTH);
			}
			if (arrayList.size() > longestArraySize)
			{
				longestArraySize = arrayList.size();
			}
		}
		matches = new long[tokenLists.size()][longestArraySize];

		//populate our 2d array
		for (int lineBreakIndex = 0; lineBreakIndex < tokenLists.size(); lineBreakIndex++)
		{
			ArrayList<Integer> lineBreakArray = tokenLists.get(lineBreakIndex);
			long packedPrefix = 0L;
			for (int index = 0; index < lineBreakArray.size(); index++)
			{
				//for each byte shift the previously packed prefix to the left 8 places and OR in the current value.
				//this lets us test for a full match against all of the values at once, by bit shifting all of our previously read values during a match hit.
				//(the first byte is stored as is, since the prefix is still 0)
				packedPrefix = (packedPrefix << 8) | lineBreakArray.get(index).longValue();
				matches[lineBreakIndex][index] = packedPrefix;
			}
		}
	}
	
	
	/**
	 * Reads the next chunk from the stream. A chunk ends with the longest token sequence that could be matched 
	 * at that position (the token bytes are part of the returned data), or at the end of the stream.
	 * No line break is appended to a final chunk that lacks one, so the original data can always be reproduced
	 * by concatenating the chunks.
	 * @return the bytes of the next chunk; an empty array once the stream is exhausted
	 * @throws IOException if reading from the underlying stream fails
	 */
	public byte[] readBytes() throws IOException
	{
		buffer.reset();
		int columnIndex = 0; //number of token bytes matched so far
		boolean firstLoop = true;
		long compareValue = 0L;
		while(true)
		{
			//on the first pass a byte may be left over from the previous call: it ended that chunk's match attempt but belongs to this chunk
			if (firstLoop == false || value == -1)
			{
				value = inputStream.read();
			}
			firstLoop = false;
			
			if (value == -1)
			{
				break;
			}
			
			if (columnIndex == 0) //if we aren't in a match, then no strange bit math is needed
			{
				compareValue = (long)value;
			}
			else //otherwise OR our current value into the previously left shifted bytes, the low 8 bits are free at this point
			{
				compareValue = compareValue | (long)value;
			}
			
			boolean foundMatch = false;
			if (columnIndex < longestArraySize) //guards against token lists that only contain empty tokens
			{
				for (long[] row : matches)
				{
					long matchValue = row[columnIndex];

					if (matchValue == 0L) //skip any uninitialized array entry (token shorter than this column)
					{
						continue;
					}
					
					if (compareValue == matchValue) //found a match, so write it out, and increment things
					{
						compareValue = compareValue << 8; //slide compare value over for next iteration
						columnIndex++;
						buffer.write(value);
						foundMatch = true;
						break;
					}
				}
			}
			
			if (foundMatch == false)
			{
				if (columnIndex > 0) //we were in a match and it just ended; the current byte stays in 'value' for the next call
				{
					return buffer.toByteArray();
				}
				//this isn't a match, and we haven't yet found one so just add it to the buffer
				buffer.write(value);
			}						
			else if (columnIndex == longestArraySize) //we can't find any more things to match on since we've reached the max length of searches 
			{
				value = -1; //the byte was already written as part of the token, so make sure the next call reads a fresh one
				return buffer.toByteArray();
			}
		}
		
		//end of stream: hand out whatever is left. We deliberately do not append a new line here, otherwise a document 
		//rebuilt by patching from a diff would differ from the original by that extra char (same approach as GNU diff).
		return buffer.toByteArray();
	}
}