package com.abmash.extraction;
import com.abmash.api.Browser;
import com.abmash.extraction.container.ExtractionContainer;
import com.abmash.extraction.container.TextExtractionContainer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WordExtractor extends FindExtractor {
public WordExtractor(Browser browser, SearchContainer searchContainer) {
super(browser, searchContainer);
}
@Override
/**
* extraction instances need to be added to the class variable extractions
*/
protected void extract() {
String text = parser.getVisibleText();
ArrayList<String> searchWords = searchContainer.getQueries();
for (String searchWord: searchWords) {
// find all matches
Pattern pattern = Pattern.compile("\\w*" + searchWord + "\\w*");
Matcher matcher = pattern.matcher(text);
// process matches and add extractions
while (matcher.find()) {
String wordMatch = matcher.group();
String exactMatch = wordMatch.substring(wordMatch.indexOf(searchWord), wordMatch.indexOf(searchWord) + searchWord.length());
// TODO windowed match
String expandedMatch =
text.substring(matcher.start() - 10, matcher.start()) +
"[" + exactMatch + "]" +
text.substring(matcher.end(), matcher.end() + 10)
;
TextExtractionContainer extraction = new TextExtractionContainer();
extraction.setWordMatch(wordMatch);
extraction.setExactMatch(exactMatch);
extraction.setExactMatchWithWindow(expandedMatch.trim().replace("\n", " "));
extractions.add(extraction);
}
}
}
@Override
protected String getExtractionOutput(ExtractionContainer extractionContainer) {
return ((TextExtractionContainer) extractionContainer).getWordMatch();
}
}