/*
 * Copyright (c) 2015-2015 Vladimir Schneider <vladimir.schneider@gmail.com>
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 * This file is based on the IntelliJ SimplePlugin tutorial
 */
package com.vladsch.idea.multimarkdown.language;

import com.intellij.lang.cacheBuilder.VersionedWordsScanner;
import com.intellij.lang.cacheBuilder.WordOccurrence;
import com.intellij.lexer.Lexer;
import com.intellij.psi.tree.IElementType;
import com.intellij.psi.tree.TokenSet;
import com.intellij.util.Processor;
import org.apache.log4j.Logger;
import org.jetbrains.annotations.NotNull;

/**
 * A words scanner for MultiMarkdown based on the custom language lexer.
 *
 * @author max
 */
public class MultiMarkdownWordsScanner extends VersionedWordsScanner {
    private static final Logger logger = Logger.getLogger(MultiMarkdownWordsScanner.class);

    private final Lexer myLexer;
    private final TokenSet myIdentifierTokenSet;
    private final TokenSet myCommentTokenSet;
    private final TokenSet myLiteralTokenSet;
    private final TokenSet mySkipCodeContextTokenSet;
    private boolean myMayHaveFileRefsInLiterals;    // also report literal words as FOREIGN_LANGUAGE occurrences (for file references)
    private boolean myKeepCodeTokensWhole;          // index identifier tokens as whole words instead of splitting them
    private boolean myUseSpaceBreaks;               // split words on whitespace only, not on identifier-character boundaries
    private WordOccurrence.Kind myDefaultKind;      // kind used for tokens outside the identifier/comment/literal sets; null skips them
    private final int myVersion;

    /**
     * Creates a new instance of the words scanner.
     *
     * @param lexer              the lexer used for breaking the text into tokens.
     * @param identifierTokenSet the set of token types which represent identifiers.
     * @param commentTokenSet    the set of token types which represent comments.
     * @param literalTokenSet    the set of token types which represent literals.
     * @param version            the scanner version reported by {@link #getVersion()}.
     */
    public MultiMarkdownWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
            final TokenSet literalTokenSet, int version) {
        this(lexer, identifierTokenSet, commentTokenSet, literalTokenSet, TokenSet.EMPTY, version);
    }

    /**
     * Creates a new instance of the words scanner.
     *
     * @param lexer                   the lexer used for breaking the text into tokens.
     * @param identifierTokenSet      the set of token types which represent identifiers.
     * @param commentTokenSet         the set of token types which represent comments.
     * @param literalTokenSet         the set of token types which represent literals.
     * @param skipCodeContextTokenSet the set of token types which should not be considered as code context.
     * @param version                 the scanner version reported by {@link #getVersion()}.
     */
    public MultiMarkdownWordsScanner(final Lexer lexer, final TokenSet identifierTokenSet, final TokenSet commentTokenSet,
            final TokenSet literalTokenSet, @NotNull TokenSet skipCodeContextTokenSet, int version) {
        myLexer = lexer;
        myIdentifierTokenSet = identifierTokenSet;
        myCommentTokenSet = commentTokenSet;
        myLiteralTokenSet = literalTokenSet;
        mySkipCodeContextTokenSet = skipCodeContextTokenSet;
        myDefaultKind = WordOccurrence.Kind.CODE;
        myVersion = version;
    }

    @Override
    public int getVersion() {
        return myVersion;
    }

    @Override
    public void processWords(CharSequence fileText, Processor<WordOccurrence> processor) {
        myLexer.start(fileText);
        WordOccurrence occurrence = new WordOccurrence(fileText, 0, 0, null); // shared occurrence, re-initialized for every word
        IElementType type;
        while ((type = myLexer.getTokenType()) != null) {
            if (myIdentifierTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.CODE,
                        occurrence, false, myKeepCodeTokensWhole, myUseSpaceBreaks)) return;
            } else if (myCommentTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.COMMENTS,
                        occurrence, false, false, myUseSpaceBreaks)) return;
            } else if (myLiteralTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), WordOccurrence.Kind.LITERALS,
                        occurrence, myMayHaveFileRefsInLiterals, false, myUseSpaceBreaks)) return;
            } else if (myDefaultKind != null && !mySkipCodeContextTokenSet.contains(type)) {
                if (!stripWords(processor, fileText, myLexer.getTokenStart(), myLexer.getTokenEnd(), myDefaultKind,
                        occurrence, false, myDefaultKind == WordOccurrence.Kind.CODE && myKeepCodeTokensWhole, myUseSpaceBreaks)) return;
            }
            myLexer.advance();
        }
    }
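
    /*
     * A minimal usage sketch (not part of this class): drive processWords() directly and collect the
     * ranges it reports. MultiMarkdownLexer and MultiMarkdownTypes are placeholder names for the
     * plugin's actual lexer and token-set holders; only the platform classes used here (Processor,
     * WordOccurrence, TextRange) are real API.
     *
     *   static List<TextRange> scanWordRanges(CharSequence text) {
     *       MultiMarkdownWordsScanner scanner = new MultiMarkdownWordsScanner(
     *               new MultiMarkdownLexer(),          // hypothetical lexer
     *               MultiMarkdownTypes.IDENTIFIERS,    // hypothetical token sets
     *               MultiMarkdownTypes.COMMENTS,
     *               MultiMarkdownTypes.LITERALS,
     *               1);
     *       scanner.setUseSpaceBreaks(true);
     *       final List<TextRange> ranges = new ArrayList<TextRange>();
     *       scanner.processWords(text, new Processor<WordOccurrence>() {
     *           @Override
     *           public boolean process(WordOccurrence occurrence) {
     *               // the occurrence instance is reused between calls, so copy what you need
     *               ranges.add(new TextRange(occurrence.getStart(), occurrence.getEnd()));
     *               return true; // returning false aborts the scan
     *           }
     *       });
     *       return ranges;
     *   }
     */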

    protected static boolean stripWords(final Processor<WordOccurrence> processor,
            final CharSequence tokenText,
            int from,
            int to,
            final WordOccurrence.Kind kind,
            @NotNull WordOccurrence occurrence,
            boolean mayHaveFileRefs,
            boolean keepTokensWhole,
            boolean useSpaceBreaks) {
        // The ASCII fast path below looks odd but is deliberate: Character.isJavaIdentifierStart/Part
        // are comparatively costly operations because of their Unicode handling.
        int index = from;
        StringBuilder tokens = new StringBuilder(to - from + 100);
        StringBuilder foreign = new StringBuilder(to - from + 100);
        try {
            if (keepTokensWhole) {
                // index the whole token as a single word
                tokens.append('\'');
                tokens.append(tokenText.subSequence(from, to));
                tokens.append('\'');
                tokens.append(' ');
                occurrence.init(tokenText, from, to, kind);
                if (!processor.process(occurrence)) return false;

                if (mayHaveFileRefs) {
                    foreign.append('\'');
                    foreign.append(tokenText.subSequence(from, to));
                    foreign.append('\'');
                    foreign.append(' ');
                    occurrence.init(tokenText, from, to, WordOccurrence.Kind.FOREIGN_LANGUAGE);
                    if (!processor.process(occurrence)) return false;
                }
            } else {
                ScanWordsLoop:
                while (true) {
                    // skip to the start of the next word
                    while (true) {
                        if (index == to) break ScanWordsLoop;
                        char c = tokenText.charAt(index);
                        if ((useSpaceBreaks && c != ' ' && c != '\n' && c != '\t') || isAsciiIdentifierPart(c) || Character.isJavaIdentifierStart(c)) {
                            break;
                        }
                        index++;
                    }
                    int wordStart = index;

                    // advance to the end of the word
                    while (true) {
                        index++;
                        if (index == to) break;
                        char c = tokenText.charAt(index);
                        if (c == ' ' || c == '\n' || c == '\t') break;
                        if (useSpaceBreaks || isAsciiIdentifierPart(c)) continue;
                        if (!Character.isJavaIdentifierPart(c)) break;
                    }
                    int wordEnd = index;

                    tokens.append('\'');
                    tokens.append(tokenText.subSequence(wordStart, wordEnd));
                    tokens.append('\'');
                    tokens.append(' ');
                    occurrence.init(tokenText, wordStart, wordEnd, kind);
                    if (!processor.process(occurrence)) return false;

                    if (mayHaveFileRefs) {
                        foreign.append('\'');
                        foreign.append(tokenText.subSequence(wordStart, wordEnd));
                        foreign.append('\'');
                        foreign.append(' ');
                        occurrence.init(tokenText, wordStart, wordEnd, WordOccurrence.Kind.FOREIGN_LANGUAGE);
                        if (!processor.process(occurrence)) return false;
                    }
                }
            }
            return true;
        } finally {
            if (tokens.length() > 0) {
                logger.info(kind.toString() + ": " + tokens.subSequence(0, Math.min(tokens.length(), 100)).toString());
            } else {
                logger.info(" no " + kind.toString() + ": tokens in " + tokenText.subSequence(from, Math.min(to, from + 100)));
            }
            if (foreign.length() > 0) {
                logger.info(WordOccurrence.Kind.FOREIGN_LANGUAGE.toString() + ": " + foreign.subSequence(0, Math.min(foreign.length(), 100)).toString());
            }
        }
    }

    private static boolean isAsciiIdentifierPart(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$';
    }

    public void setMayHaveFileRefsInLiterals(final boolean mayHaveFileRefsInLiterals) {
        myMayHaveFileRefsInLiterals = mayHaveFileRefsInLiterals;
    }

    public void setKeepCodeTokensWhole(boolean keepCodeTokensWhole) {
        myKeepCodeTokensWhole = keepCodeTokensWhole;
    }

    public void setUseSpaceBreaks(boolean useSpaceBreaks) {
        myUseSpaceBreaks = useSpaceBreaks;
    }

    public void setDefaultKind(WordOccurrence.Kind defaultKind) {
        myDefaultKind = defaultKind;
    }
}
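
/*
 * Sketch of how a scanner like this is typically handed to the platform's word cache via a
 * FindUsagesProvider. This is an assumption about the surrounding plugin code, not something this
 * file defines: getWordsScanner() is the real extension point, while the lexer and token-set names
 * below are placeholders and the remaining provider methods are omitted.
 *
 *   public class MultiMarkdownFindUsagesProvider implements FindUsagesProvider {
 *       @Override
 *       public WordsScanner getWordsScanner() {
 *           MultiMarkdownWordsScanner scanner = new MultiMarkdownWordsScanner(
 *                   new MultiMarkdownLexer(),          // hypothetical lexer
 *                   MultiMarkdownTypes.IDENTIFIERS,    // hypothetical token sets
 *                   MultiMarkdownTypes.COMMENTS,
 *                   MultiMarkdownTypes.LITERALS,
 *                   1);
 *           scanner.setMayHaveFileRefsInLiterals(true);
 *           return scanner;
 *       }
 *       // canFindUsagesFor(), getHelpId(), getType(), getDescriptiveName(), getNodeText() omitted
 *   }
 */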