package org.talend.dataquality.nlp;

import java.io.Serializable;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.jsoup.Jsoup;

/* loaded from: input_file:org/talend/dataquality/nlp/SentencePreprocessor.class */
public class SentencePreprocessor implements Serializable {
    private static final long serialVersionUID = 1;

    private SentencePreprocessor() {
    }

    public static JavaRDD<String> removePunctuation(JavaRDD<String> javaRDD) {
        final Pattern compile = Pattern.compile("\\p{Punct}+");
        return javaRDD.map(new Function<String, String>() { // from class: org.talend.dataquality.nlp.SentencePreprocessor.1
            private static final long serialVersionUID = 1;

            public String call(String str) {
                return compile.matcher(str).replaceAll(" ");
            }
        });
    }

    public static JavaRDD<String> toLowercase(JavaRDD<String> javaRDD) {
        return javaRDD.map(new Function<String, String>() { // from class: org.talend.dataquality.nlp.SentencePreprocessor.2
            private static final long serialVersionUID = 1;

            public String call(String str) {
                return str.toLowerCase();
            }
        });
    }

    public static JavaRDD<String> cleanTags(JavaRDD<String> javaRDD) {
        return javaRDD.map(new Function<String, String>() { // from class: org.talend.dataquality.nlp.SentencePreprocessor.3
            private static final long serialVersionUID = 1;

            public String call(String str) {
                return Jsoup.parse(str).text();
            }
        });
    }
}
