/*
 * Copyright (c) 2015-2015 Vladimir Schneider <vladimir.schneider@gmail.com>
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 * This file is based on the IntelliJ SimplePlugin tutorial
 */
package com.vladsch.idea.multimarkdown.language;

import com.intellij.lang.cacheBuilder.VersionedWordsScanner;
import com.intellij.lang.cacheBuilder.WordOccurrence;
import com.intellij.lexer.Lexer;
import com.intellij.psi.tree.IElementType;
import com.intellij.psi.tree.TokenSet;
import com.intellij.util.Processor;
import org.apache.log4j.Logger;
import org.jetbrains.annotations.NotNull;

/**
 * A words scanner for MultiMarkdown based on the custom language lexer.
 *
 * @author max
 */
public class MultiMarkdownWordsScanner extends VersionedWordsScanner {
    private static final Logger logger = Logger.getLogger(MultiMarkdownWordsScanner.class);

    private final Lexer myLexer;
    private final TokenSet myIdentifierTokenSet;
    private final TokenSet myCommentTokenSet;
    private final TokenSet myLiteralTokenSet;
    private final TokenSet mySkipCodeContextTokenSet;
    private boolean myMayHaveFileRefsInLiterals;    // also report literal words as FOREIGN_LANGUAGE occurrences (for file references)
    private boolean myKeepCodeTokensWhole;          // index identifier tokens as whole words instead of splitting them
    private boolean myUseSpaceBreaks;               // split words on whitespace only, not on identifier-character boundaries
    private WordOccurrence.Kind myDefaultKind;      // kind used for tokens outside the identifier/comment/literal sets; null skips them
    private final int myVersion;

    /**
     * Creates a new instance of the words scanner.
     *
     * @param lexer              the lexer used for breaking the text into tokens.
     * @param identifierTokenSet the set of token types which represent identifiers.
     * @param commentTokenSet    the set of token types which represent comments.
     * @param literalTokenSet    the set of token types which represent literals.
     * @param version            the scanner version reported by {@link #getVersion()}.
     */
    public MultiMarkdownWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
            final TokenSet literalTokenSet, int version) {
        this(lexer, identifierTokenSet, commentTokenSet, literalTokenSet, TokenSet.EMPTY, version);
    }

    /**
     * Creates a new instance of the words scanner.
     *
     * @param lexer                   the lexer used for breaking the text into tokens.
     * @param identifierTokenSet      the set of token types which represent identifiers.
     * @param commentTokenSet         the set of token types which represent comments.
     * @param literalTokenSet         the set of token types which represent literals.
     * @param skipCodeContextTokenSet the set of token types which should not be considered as code context.
     * @param version                 the scanner version reported by {@link #getVersion()}.
     */
    public MultiMarkdownWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
            final TokenSet literalTokenSet, @NotNull TokenSet skipCodeContextTokenSet, int version) {
        myLexer = lexer;
        myIdentifierTokenSet = identifierTokenSet;
        myCommentTokenSet = commentTokenSet;
        myLiteralTokenSet = literalTokenSet;
        mySkipCodeContextTokenSet = skipCodeContextTokenSet;
        myDefaultKind = WordOccurrence.Kind.CODE;
        myVersion = version;
    }

    @Override
    public int getVersion() {
        return myVersion;
    }

    @Override
    public void processWords(CharSequence fileText, Processor<WordOccurrence> processor) {
        myLexer.start(fileText);
        WordOccurrence occurrence = new WordOccurrence(fileText, 0, 0, null); // shared occurrence, re-initialized for every word
        IElementType type;
        while ((type = myLexer.getTokenType()) != null) {
            if (myIdentifierTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.CODE,
                        occurrence, false, myKeepCodeTokensWhole, myUseSpaceBreaks)) return;
            } else if (myCommentTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.COMMENTS,
                        occurrence, false, false, myUseSpaceBreaks)) return;
            } else if (myLiteralTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.LITERALS,
                        occurrence, myMayHaveFileRefsInLiterals, false, myUseSpaceBreaks)) return;
            } else if (myDefaultKind != null && !mySkipCodeContextTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), myDefaultKind,
                        occurrence, false, myDefaultKind == WordOccurrence.Kind.CODE && myKeepCodeTokensWhole, myUseSpaceBreaks)) return;
            }
            myLexer.advance();
        }
    }
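
    /*
     * A minimal usage sketch (not part of this class): drive processWords() directly and collect the
     * ranges it reports. MultiMarkdownLexer and MultiMarkdownTypes are placeholder names for the
     * plugin's actual lexer and token-set holders; only the platform classes used here (Processor,
     * WordOccurrence, TextRange) are real API.
     *
     *   static List<TextRange> scanWordRanges(CharSequence text) {
     *       MultiMarkdownWordsScanner scanner = new MultiMarkdownWordsScanner(
     *               new MultiMarkdownLexer(),          // hypothetical lexer
     *               MultiMarkdownTypes.IDENTIFIERS,    // hypothetical token sets
     *               MultiMarkdownTypes.COMMENTS,
     *               MultiMarkdownTypes.LITERALS,
     *               1);
     *       scanner.setUseSpaceBreaks(true);
     *       final List<TextRange> ranges = new ArrayList<TextRange>();
     *       scanner.processWords(text, new Processor<WordOccurrence>() {
     *           @Override
     *           public boolean process(WordOccurrence occurrence) {
     *               // the occurrence instance is reused between calls, so copy what you need
     *               ranges.add(new TextRange(occurrence.getStart(), occurrence.getEnd()));
     *               return true; // returning false aborts the scan
     *           }
     *       });
     *       return ranges;
     *   }
     */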

    protected static boolean stripWords(final Processor<WordOccurrence> processor,
            final CharSequence tokenText,
            int from,
            int to,
            final WordOccurrence.Kind kind,
            @NotNull WordOccurrence occurrence,
            boolean mayHaveFileRefs,
            boolean keepTokensWhole,
            boolean useSpaceBreaks) {
        // The ASCII fast path below looks odd but is deliberate: Character.isJavaIdentifierStart/Part
        // are comparatively costly operations because of their Unicode handling.
        int index = from;
        StringBuilder tokens = new StringBuilder(to - from + 100);
        StringBuilder foreign = new StringBuilder(to - from + 100);
        try {
            if (keepTokensWhole) {
                // index the whole token as a single word
                tokens.append('\'');
                tokens.append(tokenText.subSequence(from, to));
                tokens.append('\'');
                tokens.append(' ');
                occurrence.init(tokenText, from, to, kind);
                if (!processor.process(occurrence)) return false;

                if (mayHaveFileRefs) {
                    foreign.append('\'');
                    foreign.append(tokenText.subSequence(from, to));
                    foreign.append('\'');
                    foreign.append(' ');
                    occurrence.init(tokenText, from, to, WordOccurrence.Kind.FOREIGN_LANGUAGE);
                    if (!processor.process(occurrence)) return false;
                }
            } else {
                ScanWordsLoop:
                while (true) {
                    // skip to the start of the next word
                    while (true) {
                        if (index == to) break ScanWordsLoop;
                        char c = tokenText.charAt(index);
                        if ((useSpaceBreaks && c != ' ' && c != '\n' && c != '\t') || isAsciiIdentifierPart(c) || Character.isJavaIdentifierStart(c)) {
                            break;
                        }
                        index++;
                    }
                    int wordStart = index;

                    // advance to the end of the word
                    while (true) {
                        index++;
                        if (index == to) break;
                        char c = tokenText.charAt(index);
                        if (c == ' ' || c == '\n' || c == '\t') break;
                        if (useSpaceBreaks || isAsciiIdentifierPart(c)) continue;
                        if (!Character.isJavaIdentifierPart(c)) break;
                    }
                    int wordEnd = index;

                    tokens.append('\'');
                    tokens.append(tokenText.subSequence(wordStart, wordEnd));
                    tokens.append('\'');
                    tokens.append(' ');
                    occurrence.init(tokenText, wordStart, wordEnd, kind);
                    if (!processor.process(occurrence)) return false;

                    if (mayHaveFileRefs) {
                        foreign.append('\'');
                        foreign.append(tokenText.subSequence(wordStart, wordEnd));
                        foreign.append('\'');
                        foreign.append(' ');
                        occurrence.init(tokenText, wordStart, wordEnd, WordOccurrence.Kind.FOREIGN_LANGUAGE);
                        if (!processor.process(occurrence)) return false;
                    }
                }
            }
            return true;
        } finally {
            if (tokens.length() > 0) {
                logger.info(kind.toString() + ": " + tokens.subSequence(0, Math.min(tokens.length(), 100)).toString());
            } else {
                logger.info(" no " + kind.toString() + ": tokens in " + tokenText.subSequence(from, Math.min(to, from + 100)));
            }
            if (foreign.length() > 0) {
                logger.info(WordOccurrence.Kind.FOREIGN_LANGUAGE.toString() + ": " + foreign.subSequence(0, Math.min(foreign.length(), 100)).toString());
            }
        }
    }

    private static boolean isAsciiIdentifierPart(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$';
    }

    public void setMayHaveFileRefsInLiterals(final boolean mayHaveFileRefsInLiterals) {
        myMayHaveFileRefsInLiterals = mayHaveFileRefsInLiterals;
    }

    public void setKeepCodeTokensWhole(boolean keepCodeTokensWhole) {
        myKeepCodeTokensWhole = keepCodeTokensWhole;
    }

    public void setUseSpaceBreaks(boolean useSpaceBreaks) {
        myUseSpaceBreaks = useSpaceBreaks;
    }

    public void setDefaultKind(WordOccurrence.Kind defaultKind) {
        myDefaultKind = defaultKind;
    }
}
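
/*
 * Sketch of how a scanner like this is typically handed to the platform's word cache via a
 * FindUsagesProvider. This is an assumption about the surrounding plugin code, not something this
 * file defines: getWordsScanner() is the real extension point, while the lexer and token-set names
 * below are placeholders and the remaining provider methods are omitted.
 *
 *   public class MultiMarkdownFindUsagesProvider implements FindUsagesProvider {
 *       @Override
 *       public WordsScanner getWordsScanner() {
 *           MultiMarkdownWordsScanner scanner = new MultiMarkdownWordsScanner(
 *                   new MultiMarkdownLexer(),          // hypothetical lexer
 *                   MultiMarkdownTypes.IDENTIFIERS,    // hypothetical token sets
 *                   MultiMarkdownTypes.COMMENTS,
 *                   MultiMarkdownTypes.LITERALS,
 *                   1);
 *           scanner.setMayHaveFileRefsInLiterals(true);
 *           return scanner;
 *       }
 *       // canFindUsagesFor(), getHelpId(), getType(), getDescriptiveName(), getNodeText() omitted
 *   }
 */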