GoogleDocsTextConverter.java example

Explorer

Momus-master
- src
  - main
    - java
      - no
        dusken
        momus
        authentication
        UserDetailsService.java
        UserDetailsServiceImpl.java
        controller
        ArticleController.java
        DevController.java
        FavouriteSectionController.java
        NoteController.java
        PersonController.java
        PublicationController.java
        SourceController.java
        diff
        DiffMatchPatch.java
        DiffUtil.java
        TagToUnicodeConverter.java
        exceptions
        ExceptionHandler.java
        RestException.java
        ldap
        LdapSyncer.java
        mapper
        HibernateAwareObjectMapper.java
        model
        Article.java
        ArticleReview.java
        ArticleRevision.java
        ArticleStatus.java
        ArticleType.java
        FavouriteSection.java
        KeyValue.java
        LandingPage.java
        LayoutStatus.java
        Note.java
        Page.java
        Person.java
        Publication.java
        Section.java
        Source.java
        SourceTag.java
        service
        ArticleService.java
        FavouriteSectionService.java
        KeyValueService.java
        LandingPageService.java
        NoteService.java
        PublicationService.java
        SourceService.java
        drive
        GoogleDocsTextConverter.java
        GoogleDriveService.java
        indesign
        IndesignExport.java
        IndesignGenerator.java
        repository
        ArticleRepository.java
        ArticleReviewRepository.java
        ArticleRevisionRepository.java
        ArticleStatusRepository.java
        ArticleTypeRepository.java
        FavouriteSectionRepository.java
        KeyValueRepository.java
        LandingPageRepository.java
        LayoutStatusRepository.java
        NoteRepository.java
        PageRepository.java
        PersonRepository.java
        PublicationRepository.java
        SectionRepository.java
        SourceRepository.java
        SourceTagRepository.java
        search
        ArticleQueryBuilder.java
        ArticleSearchParams.java
  - test
    - java
      - no
        dusken
        momus
        authentication
        UserDetailsServiceMock.java
        service
        ArticleServiceTest.java
        KeyValueServiceTest.java
        PublicationServiceTest.java
        SourceServiceTest.java
        drive
        GoogleDocsTextConverterTest.java
        indesign
        IndesignGeneratorTest.java
        search
        ArticleQueryBuilderTest.java
        test
        AbstractTestRunner.java

/*
 * Copyright 2016 Studentmediene i Trondheim AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package no.dusken.momus.service.drive;

import org.apache.commons.lang3.StringEscapeUtils;
import org.springframework.stereotype.Service;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts content from a Google Drive Document (its HTML export)
 * to our representation.
 *
 * The conversion is a fixed pipeline of regular-expression rewrites, see
 * {@link #convert(String)} for the order in which they are applied.
 * All patterns are compiled once when the converter is created.
 */
@Service
public class GoogleDocsTextConverter {

    // DOTALL lets the captured content span several lines, so documents that
    // are not delivered as one single line are still split correctly.
    final Pattern body = Pattern.compile("<body.*?>(.*)</body>", Pattern.DOTALL);
    final Pattern css = Pattern.compile("<style type=\"text/css\">(.*)</style>", Pattern.DOTALL);

    // Group 1 is the CSS class name of a rule that only sets italic / bold
    final Pattern italicStyleName = Pattern.compile("\\.([^{]*?)\\{font-style:italic\\}");
    final Pattern boldStyleName = Pattern.compile("\\.([^{]*?)\\{font-weight:bold\\}");


    final Pattern aTags = Pattern.compile("<a[^>]*?></a>");
    final Pattern classes = Pattern.compile(" class=\".*?\"");
    final Pattern spans = Pattern.compile("</?span.*?>");
    final Pattern emptyP = Pattern.compile("<p>\\s?</p>");

    // Index i holds the pattern for the opening tag of heading level i + 1
    final ArrayList<Pattern> hTags = initTitles();

    final Pattern dashes = Pattern.compile("--");
    final Pattern inlineComments = Pattern.compile("<sup>.*?</sup>");
    // Non-breaking spaces, both as the HTML entity and as the literal character U+00A0
    final Pattern spaces = Pattern.compile("&nbsp;|\\u00A0");
    final Pattern comments = Pattern.compile("<div><p>.*?</p></div>");

    final Pattern lists = Pattern.compile(" start=\".*?\"");

    final Pattern table = Pattern.compile("<table[^>]*?>.*?</table>", Pattern.DOTALL);
    final Pattern img = Pattern.compile("<img.*?>");


    // Placeholder characters that temporarily stand in for the &lt; and &gt;
    // entities while every other entity is decoded. They are assumed not to
    // occur in real documents.
    final String ltUnicode = Character.toString((char) 44000);
    final String gtUnicode = Character.toString((char) 44001);
    final Pattern ltToUnicode = Pattern.compile("&lt;");
    final Pattern gtToUnicode = Pattern.compile("&gt;");
    final Pattern unicodeToLt = Pattern.compile(ltUnicode);
    final Pattern unicodeToGt = Pattern.compile(gtUnicode);

    /**
     * Runs the whole conversion pipeline on an exported document.
     *
     * @param input the complete HTML of the exported document
     * @return the cleaned-up HTML fragment
     */
    public String convert(String input) {
        String bodyHtml = extractBody(input);
        String styleSheet = extractCss(input);

        // Must run first: it relies on the class attributes and spans
        // that the following steps strip away.
        String out = findItalicsAndBold(bodyHtml, styleSheet);

        out = removeEmptyATags(out);
        out = removeClasses(out);
        out = removeSpans(out);
        out = removeComments(out);
        out = removeInvalidContent(out);
        out = removeListAttributes(out);
        out = removeEmptyPTags(out);
        out = unescapeHtml(out);
        out = replaceTitles(out);
        out = replaceDashes(out);

        return out;
    }

    /**
     * Only interested in the stuff inside {@code <body></body>}.
     *
     * @return the content of the body element, or the input unchanged
     *         when no body element is found
     */
    private String extractBody(String in) {
        Matcher m = body.matcher(in);
        return m.find() ? m.group(1) : in;
    }

    /**
     * Extracts the content of the {@code <style type="text/css">} element.
     *
     * @return the style sheet, or the input unchanged when no such element
     *         is found (so the style rules can still be searched for in the
     *         whole document)
     */
    private String extractCss(String in) {
        Matcher m = css.matcher(in);
        return m.find() ? m.group(1) : in;
    }

    /**
     * Builds the patterns matching the opening tags of h1 to h4
     * that carry attributes.
     */
    private ArrayList<Pattern> initTitles() {
        ArrayList<Pattern> titles = new ArrayList<Pattern>();
        for (int level = 1; level < 5; level++) {
            titles.add(Pattern.compile("<h" + level + " [^>]*>"));
        }
        return titles;
    }

    /**
     * Bold and italics are not marked with tags in GDocs, instead it is applied with CSS.
     * For instance:
     * .c1{font-weight:bold}
     * lalala {@code <span class="c1">bold</span>}
     *
     * The class names change each time, so we need to dynamically find them
     * and change the spans to {@code <i>} or {@code <b>}.
     *
     * @param body the HTML body, still containing spans and class attributes
     * @param css  the style sheet of the document
     * @return the body with the styled spans replaced by real tags
     */
    private String findItalicsAndBold(String body, String css) {
        String out = replaceStyledSpans(body, css, italicStyleName, "i");
        return replaceStyledSpans(out, css, boldStyleName, "b");
    }

    /**
     * Replaces the spans of every class matched by {@code styleName} in the
     * style sheet with the given tag. Every matching rule is handled, not only
     * the first one, since several classes can carry the same style.
     */
    private String replaceStyledSpans(String body, String css, Pattern styleName, String tag) {
        String out = body;
        Matcher styleMatcher = styleName.matcher(css);

        while (styleMatcher.find()) {
            String className = styleMatcher.group(1);
            if (className.isEmpty()) {
                continue;
            }

            // The class name comes from the document, so it is quoted to keep
            // it from being interpreted as regular expression syntax.
            Pattern styledSpan = Pattern.compile(
                    "<span class=\"" + Pattern.quote(className) + "\">(.*?)</span>");

            // $1 means what is matched inside the parentheses in the pattern
            out = styledSpan.matcher(out).replaceAll("<" + tag + ">$1</" + tag + ">");
        }

        return out;
    }

    /**
     * Remove {@code <a name=*></a>} stuff google inserts everywhere
     */
    private String removeEmptyATags(String in) {
        return aTags.matcher(in).replaceAll("");
    }

    /**
     * Removes all class attributes
     */
    private String removeClasses(String in) {
        return classes.matcher(in).replaceAll("");
    }

    /**
     * Removes opening and closing span tags, keeping their content
     */
    private String removeSpans(String in) {
        return spans.matcher(in).replaceAll("");
    }

    /**
     * Removing ids (and any other attributes) from header tags
     */
    private String replaceTitles(String in) {
        String out = in;
        for (int i = 0; i < hTags.size(); i++) {
            out = hTags.get(i).matcher(out).replaceAll("<h" + (i + 1) + ">");
        }
        return out;
    }

    /**
     * In case someone likes to have much space between their paragraphs..
     */
    private String removeEmptyPTags(String in) {
        return emptyP.matcher(in).replaceAll("");
    }

    /**
     * Comments inserted should be removed as they don't belong to the text.
     * A comment adds a {@code <sup>}-reference to the text, and then the comment
     * itself at the bottom.
     */
    private String removeComments(String in) {
        String out = inlineComments.matcher(in).replaceAll("");

        // Spaces inside a marked text are written as &nbsp;
        out = spaces.matcher(out).replaceAll(" ");

        return comments.matcher(out).replaceAll("");
    }

    /**
     * Removes the start attribute from the lists
     */
    private String removeListAttributes(String in) {
        return lists.matcher(in).replaceAll("");
    }

    /**
     * Removes images and tables, should possibly remove more stuff
     * but try to keep the contents, not just the formatting.
     */
    private String removeInvalidContent(String in) {
        String out = table.matcher(in).replaceAll("");

        // An image is replaced by a space so the surrounding words stay separated
        return img.matcher(out).replaceAll(" ");
    }

    /**
     * Replaces "--" with an en-dash
     */
    private String replaceDashes(String in) {
        return dashes.matcher(in).replaceAll("–");
    }

    /**
     * Converts HTML entities to "normal characters", for instance
     * it converts {@code &aring;} to å
     *
     * But {@code &lt;} (&lt;) and {@code &gt;} (&gt;) are ignored, to avoid
     * less-than and greater-than signs in the written text to affect our HTML.
     */
    private String unescapeHtml(String in) {
        // hide all &lt; and &gt; behind placeholders so they survive the decoding
        String out = ltToUnicode.matcher(in).replaceAll(ltUnicode);
        out = gtToUnicode.matcher(out).replaceAll(gtUnicode);

        // convert stuff
        out = StringEscapeUtils.unescapeHtml4(out);

        // Convert quotes to "guillemets". Done after the decoding so quotes
        // written as entities are converted as well.
        out = out.replace("“", "«");
        out = out.replace("”", "»");

        // add the &lt; and &gt; entities back
        out = unicodeToLt.matcher(out).replaceAll("&lt;");
        out = unicodeToGt.matcher(out).replaceAll("&gt;");

        return out;
    }
}