package com.datumbox.framework.utilities.text.cleaners;

import com.datumbox.common.utilities.PHPfunctions;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

/* loaded from: input_file:com/datumbox/framework/utilities/text/cleaners/StringCleaner.class */
/**
 * Static helpers that normalize raw text: URLs and emoticons are replaced by placeholder
 * tokens, accents and symbols are stripped, whitespace is collapsed and sentence
 * terminators are unified.
 *
 * <p>None of the methods accept {@code null}; passing it results in a
 * {@link NullPointerException}.
 */
public class StringCleaner {
    /** Placeholder (padded with one space on each side) substituted for every matched URL. */
    public static final String TOKENIZED_URL = " PREPROCESSDOC_URL ";

    /** Matches http(s), ftp and file URLs, ignoring case. */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.UNICODE_CASE);

    /** One or more whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /** Any character that is not a Unicode letter, a Unicode separator or an underscore. */
    private static final Pattern NON_LETTER_OR_SEPARATOR = Pattern.compile("[^\\p{L}\\p{Z}_]");

    /** Runs of quote, comma, colon, semicolon, parenthesis and hyphen characters. */
    private static final Pattern SOFT_PUNCTUATION = Pattern.compile("[\",:;()\\-]+");

    /** A single sentence terminator: period, exclamation mark or question mark. */
    private static final Pattern TERMINATOR = Pattern.compile("[\\.!?]");

    /** A period followed by any mix of further periods and spaces. */
    private static final Pattern TERMINATOR_RUN = Pattern.compile("\\.[\\. ]+");

    /** A period together with any whitespace surrounding it. */
    private static final Pattern SPACED_PERIOD = Pattern.compile("\\s*\\.\\s*");

    /** Combining diacritical marks left behind by NFD decomposition. */
    private static final Pattern DIACRITICS = Pattern.compile("[\\p{InCombiningDiacriticalMarks}]");

    /**
     * Emoticon regular expressions mapped to their replacement tokens. Keys are used as
     * regexes and values as replacement strings by {@link #tokenizeSmileys(String)}, so
     * regex metacharacters in keys and {@code $}/{@code \} in values are interpreted.
     * The map is public and mutable; entries added to it are picked up on the next call.
     */
    public static final Map<String, String> smileys = new HashMap<>();

    static {
        smileys.put(":\\)", " PREPROCESSDOC_EM1 ");
        smileys.put(":-\\)", " PREPROCESSDOC_EM2 ");
        smileys.put(":\\(", " PREPROCESSDOC_EM3 ");
        smileys.put(":-\\(", " PREPROCESSDOC_EM4 ");
        smileys.put(":d", " PREPROCESSDOC_EM5 ");
        smileys.put(";\\)", " PREPROCESSDOC_EM6 ");
        smileys.put(":o\\)", " PREPROCESSDOC_EM7 ");
        smileys.put(":\\]", " PREPROCESSDOC_EM8 ");
        smileys.put(":\\[", " PREPROCESSDOC_EM9 ");
        smileys.put(":p", " PREPROCESSDOC_EM10 ");
        smileys.put(":-p", " PREPROCESSDOC_EM11 ");
        smileys.put("8-\\)", " PREPROCESSDOC_EM12 ");
        smileys.put("=\\)", " PREPROCESSDOC_EM13 ");
        smileys.put("=\\(", " PREPROCESSDOC_EM14 ");
    }

    /**
     * Replaces every URL in the text with {@link #TOKENIZED_URL}.
     *
     * @param str the text to process
     * @return the text with each matched URL replaced by the placeholder
     */
    public static String tokenizeURLs(String str) {
        return URL_PATTERN.matcher(str).replaceAll(TOKENIZED_URL);
    }

    /**
     * Replaces every emoticon registered in {@link #smileys} with its token.
     * Matching is case-sensitive, exactly as the registered regexes are written.
     *
     * @param str the text to process
     * @return the text with each matched emoticon replaced by its token
     */
    public static String tokenizeSmileys(String str) {
        String result = str;
        for (Map.Entry<String, String> entry : smileys.entrySet()) {
            // Keys are looked up at call time because the map may be extended by callers.
            result = result.replaceAll(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /**
     * Trims the text and collapses every run of whitespace into a single space.
     *
     * @param str the text to process
     * @return the trimmed text with single spaces between tokens
     */
    public static String removeExtraSpaces(String str) {
        return WHITESPACE_RUN.matcher(str.trim()).replaceAll(" ");
    }

    /**
     * Deletes every character that is not a Unicode letter, a Unicode separator or an
     * underscore. Digits and punctuation are therefore removed as well.
     *
     * @param str the text to process
     * @return the text containing only letters, separators and underscores
     */
    public static String removeSymbols(String str) {
        return NON_LETTER_OR_SEPARATOR.matcher(str).replaceAll("");
    }

    /**
     * Normalizes punctuation so that every sentence ends with a single period followed
     * by one space (the trailing space of the last sentence is trimmed).
     *
     * @param str the text to process
     * @return the text with unified sentence terminators
     */
    public static String unifyTerminators(String str) {
        // The order of the four passes matters: each one relies on the output of the previous.
        String text = SOFT_PUNCTUATION.matcher(str).replaceAll(" ");
        text = TERMINATOR.matcher(text).replaceAll(".");
        text = TERMINATOR_RUN.matcher(text).replaceAll(".");
        text = SPACED_PERIOD.matcher(text).replaceAll(". ");
        return text.trim();
    }

    /**
     * Strips accents by decomposing the text to NFD form and deleting the combining
     * diacritical marks.
     *
     * @param str the text to process
     * @return the text without combining diacritical marks
     */
    public static String removeAccents(String str) {
        String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICS.matcher(decomposed).replaceAll("");
    }

    /**
     * Runs the full cleaning pipeline: URL tokenization, emoticon tokenization, accent
     * removal, symbol removal, whitespace collapsing and lower-casing.
     *
     * <p>Because symbol removal drops digits, the numeric suffix of the emoticon tokens
     * does not survive this method. Lower-casing uses {@link Locale#ROOT} so the result
     * does not depend on the JVM default locale.
     *
     * @param str the text to clean
     * @return the cleaned, lower-cased text
     */
    public static String clear(String str) {
        String text = tokenizeURLs(str);
        text = tokenizeSmileys(text);
        text = removeAccents(text);
        text = removeSymbols(text);
        text = removeExtraSpaces(text);
        return text.toLowerCase(Locale.ROOT);
    }
}
