package com.andrada.sitracker.reader;
import com.andrada.sitracker.Constants;
import com.andrada.sitracker.db.beans.SearchedAuthor;
import org.jetbrains.annotations.NotNull;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts author search results from the text of a samlib.ru search page.
 *
 * <p>Matches are located with {@link Constants#SAMLIB_AUTHOR_SEARCH_REGEX}; capture
 * group 1 is used as the author URL, group 2 as the author name and group 3 as the
 * description. Results are de-duplicated by their normalized URL.
 */
public class SamlibAuthorSearchReader implements AuthorSearchReader {

    private static final String BASE_URL = "http://samlib.ru";

    /**
     * Compiled once and shared: the expression and flags never change, so there is
     * no reason to pay the compilation cost on every page.
     */
    private static final Pattern AUTHOR_SEARCH_PATTERN = Pattern.compile(
            Constants.SAMLIB_AUTHOR_SEARCH_REGEX,
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Collects the distinct authors found in the given page.
     *
     * <p>An author whose normalized URL has already been seen is not added a second
     * time; instead {@code recordSearchHit()} is invoked on the entry that is already
     * stored. Matches with a missing or empty URL group are skipped.
     *
     * @param pageContent text of the search result page; {@code null} is treated as a
     *                    page without any matches
     * @return the unique authors, in no particular order; never {@code null}
     */
    @Override
    @NotNull
    public Collection<SearchedAuthor> getUniqueAuthorsFromPage(String pageContent) {
        Map<String, SearchedAuthor> authors = new HashMap<String, SearchedAuthor>();
        if (pageContent == null) {
            // Nothing to scan; honour the @NotNull contract instead of failing in matcher().
            return authors.values();
        }

        Matcher matcher = AUTHOR_SEARCH_PATTERN.matcher(pageContent);
        while (matcher.find()) {
            String rawUrl = matcher.group(1);
            if (rawUrl == null || rawUrl.isEmpty()) {
                continue;
            }

            String authorUrl = normalizeUrl(rawUrl);
            // Null values are never stored, so a null lookup result means "not seen yet".
            SearchedAuthor known = authors.get(authorUrl);
            if (known != null) {
                known.recordSearchHit();
                continue;
            }

            String authorName = trimOrEmpty(matcher.group(2));
            String descr = trimOrEmpty(matcher.group(3));
            authors.put(authorUrl, new SearchedAuthor(authorUrl, authorName, descr));
        }
        return authors.values();
    }

    /**
     * Returns the trimmed text, or an empty string when the group did not participate
     * in the match.
     */
    private static String trimOrEmpty(String text) {
        return text == null ? "" : text.trim();
    }

    /**
     * Builds the full author page URL from the path captured on the search page.
     *
     * <p>The path is prefixed with {@link #BASE_URL}, inserting a {@code "/"} when the
     * path does not start with one, and then the page ending is appended:
     * {@link Constants#AUTHOR_PAGE_URL_ENDING_WO_SLASH} when the URL already ends with
     * {@code "/"}, otherwise {@link Constants#AUTHOR_PAGE_URL_ENDING_WI_SLASH}.
     *
     * <p>NOTE(review): the value is always treated as a site-relative path; confirm
     * that the search regex can never capture an absolute URL.
     *
     * @param value non-null path captured from the page
     * @return the normalized author page URL
     */
    private static String normalizeUrl(String value) {
        String absolute = value.startsWith("/")
                ? BASE_URL + value
                : BASE_URL + "/" + value;
        String ending = absolute.endsWith("/")
                ? Constants.AUTHOR_PAGE_URL_ENDING_WO_SLASH
                : Constants.AUTHOR_PAGE_URL_ENDING_WI_SLASH;
        return absolute + ending;
    }
}