package com.datumbox.framework.utilities.text.cleaners;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;

/* loaded from: input_file:com/datumbox/framework/utilities/text/cleaners/HTMLCleaner.class */
/**
 * Regex-based helpers for stripping markup from HTML documents and for
 * extracting selected pieces of them (title, hyperlinks, meta tags, headers).
 *
 * <p>All methods are static and operate on the complete HTML passed in as a
 * {@code String}. The input must be non-null; a {@code null} argument results
 * in a {@link NullPointerException} from the underlying regex matcher.
 *
 * <p>This is pattern matching, not HTML parsing: malformed or unusual markup
 * (for example unquoted attribute values or a {@code >} inside an attribute)
 * may not be handled the way a browser would handle it.
 */
public class HTMLCleaner {
    /** Every tag pattern ignores case and lets {@code .} span line breaks. */
    private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /**
     * An {@code <img>} tag carrying an {@code alt} or {@code title} attribute; group 1 is the
     * attribute value. The leading {@code [^>]*} is greedy, so when several such attributes are
     * present the last one in the tag is captured. The attribute name must be preceded by
     * whitespace so that names merely ending in those letters (height, style, ...) do not match.
     */
    private static final Pattern IMG_ALT_TITLE_PATTERN = Pattern.compile("<[\\s]*img[^>]*\\s(?:alt|title)[\\s]*=[\\s]*[\\\"']?([^>\\\"']+)[\\\"']?[^>]*>", TAG_FLAGS);

    /** Elements whose content is never human-readable text; matched together with their body. */
    private static final Pattern NON_TEXT_TAGS_PATTERN = Pattern.compile("<[\\s]*(head|style|script|object|embed|applet|noframes|noscript|noembed|option)[^>]*?>.*?</\\1>", TAG_FLAGS);

    /** An opening (or self-closing) tag; group 1 is the tag name, group 2 the optional "/". */
    private static final Pattern REMOVE_ATTRIBUTES_PATTERN = Pattern.compile("<([a-z!][a-z0-9]*)[^>]*?(/?)>", TAG_FLAGS);

    /** The {@code <title>} element; group 1 is its content. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title[^>]*>(.*?)</title>", TAG_FLAGS);

    /** An {@code <a>} element with a quoted href; group 1 is the URL, group 2 the inner HTML. */
    private static final Pattern HYPERLINK_PATTERN = Pattern.compile("<[\\s]*a[^>]*href[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>(.*?)</a>", TAG_FLAGS);

    /** A {@code <meta>} tag with quoted name (group 1) followed by quoted content (group 2). */
    private static final Pattern METATAG_PATTERN = Pattern.compile("<[\\s]*meta[^>]*name[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*content[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>", TAG_FLAGS);

    /** An {@code <h1>}..{@code <h6>} element; group 1 is the tag name, group 2 the inner HTML. */
    private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)</\\1>", TAG_FLAGS);

    /** An HTML comment, possibly spanning several lines. */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?-->");

    /** Anything from a {@code <} up to the next {@code >}, line breaks included. */
    private static final Pattern ANY_TAG_PATTERN = Pattern.compile("<[^>]*>");

    /** Keys of the map returned by {@link #extractHyperlinks(String)}. */
    public enum HyperlinkPart {
        HTMLTAG,
        URL,
        ANCHORTEXT
    }

    /**
     * Replaces every {@code <img>} tag that has an {@code alt} or {@code title} attribute with
     * the value of that attribute, padded by one space on each side.
     *
     * @param str the HTML to process
     * @return the HTML with matching image tags replaced; images without such an attribute
     *         (or with an empty one) are left untouched
     */
    public static String replaceImgWithAlt(String str) {
        // replaceAll returns the input text unchanged when nothing matches.
        return IMG_ALT_TITLE_PATTERN.matcher(str).replaceAll(" $1 ");
    }

    /**
     * Deletes all HTML comments ({@code <!-- ... -->}), including multi-line ones.
     *
     * @param str the HTML to process
     * @return the HTML without comments
     */
    public static String removeComments(String str) {
        return COMMENT_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Replaces every tag with a single space while keeping whatever is between the tags.
     * The text content of elements such as {@code <script>} or {@code <style>} therefore
     * stays in the result; use {@link #safeRemoveAllTags(String)} to drop it as well.
     *
     * @param str the HTML to process
     * @return the input with each {@code <...>} sequence replaced by a space
     */
    public static String unsafeRemoveAllTags(String str) {
        // [^>]* also crosses line breaks, so tags wrapped over several lines are removed too.
        return ANY_TAG_PATTERN.matcher(str).replaceAll(" ");
    }

    /**
     * Removes comments and non-text elements together with their content, then replaces all
     * remaining tags with spaces.
     *
     * @param str the HTML to process
     * @return the input without any tags, comments or non-text element bodies
     */
    public static String safeRemoveAllTags(String str) {
        return unsafeRemoveAllTags(removeNonTextTags(str));
    }

    /**
     * Strips comments first and then replaces every non-text element (head, style, script, ...)
     * including its body with a single space.
     */
    private static String removeNonTextTags(String str) {
        String withoutComments = removeComments(str);
        return NON_TEXT_TAGS_PATTERN.matcher(withoutComments).replaceAll(" ");
    }

    /**
     * Removes comments and non-text elements, strips the attributes from every remaining
     * opening tag (keeping the tag name and a trailing "/" if present) and finally unescapes
     * HTML entities.
     *
     * @param str the HTML to process
     * @return attribute-free HTML with entities unescaped
     */
    public static String removeNonTextTagsAndAttributes(String str) {
        String textTagsOnly = removeNonTextTags(str);
        String withoutAttributes = REMOVE_ATTRIBUTES_PATTERN.matcher(textTagsOnly).replaceAll("<$1$2>");
        return StringEscapeUtils.unescapeHtml4(withoutAttributes);
    }

    /**
     * Extracts the readable text of an HTML document: images are replaced by their alt/title
     * text, comments, non-text elements and all tags are removed, and HTML entities are
     * unescaped. Whitespace is not normalised.
     *
     * @param str the HTML to process
     * @return the text content of the document
     */
    public static String extractText(String str) {
        String withAltTexts = replaceImgWithAlt(str);
        return StringEscapeUtils.unescapeHtml4(safeRemoveAllTags(withAltTexts));
    }

    /**
     * Turns an HTML fragment into plain text: tags become spaces, entities are unescaped and
     * the result is passed through {@code StringCleaner.removeExtraSpaces}.
     */
    private static String clear(String str) {
        String unescaped = StringEscapeUtils.unescapeHtml4(unsafeRemoveAllTags(str));
        return StringCleaner.removeExtraSpaces(unescaped);
    }

    /**
     * Returns the cleaned text of the first {@code <title>} element.
     *
     * @param str the HTML to process
     * @return the title text, or {@code null} when the document has no title element
     */
    public static String extractTitle(String str) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(str);
        if (!titleMatcher.find()) {
            return null;
        }
        // The whole match (tags included) is cleaned; clear() strips the title tags themselves.
        return clear(titleMatcher.group(0));
    }

    /**
     * Collects all {@code <a>} elements that have a quoted {@code href} attribute.
     *
     * <p>The three lists in the result are index-aligned: position {@code i} of each list
     * describes the same link. The anchor text is the raw inner HTML of the element and is
     * not cleaned.
     *
     * @param str the HTML to process
     * @return a map with one (possibly empty) list per {@link HyperlinkPart}
     */
    public static Map<HyperlinkPart, List<String>> extractHyperlinks(String str) {
        List<String> htmlTags = new ArrayList<>();
        List<String> urls = new ArrayList<>();
        List<String> anchorTexts = new ArrayList<>();

        Map<HyperlinkPart, List<String>> hyperlinks = new HashMap<>();
        hyperlinks.put(HyperlinkPart.HTMLTAG, htmlTags);
        hyperlinks.put(HyperlinkPart.URL, urls);
        hyperlinks.put(HyperlinkPart.ANCHORTEXT, anchorTexts);

        Matcher linkMatcher = HYPERLINK_PATTERN.matcher(str);
        while (linkMatcher.find()) {
            htmlTags.add(linkMatcher.group(0));
            urls.add(linkMatcher.group(1));
            anchorTexts.add(linkMatcher.group(2));
        }
        return hyperlinks;
    }

    /**
     * Collects {@code <meta name="..." content="...">} tags. Only tags whose quoted
     * {@code name} attribute appears before the quoted {@code content} attribute are matched.
     * Names and contents are cleaned before being stored; when the same name occurs more than
     * once the last occurrence wins.
     *
     * @param str the HTML to process
     * @return a map from meta name to meta content, empty when none are found
     */
    public static Map<String, String> extractMetatags(String str) {
        Map<String, String> metatags = new HashMap<>();
        Matcher metaMatcher = METATAG_PATTERN.matcher(str);
        while (metaMatcher.find()) {
            metatags.put(clear(metaMatcher.group(1)), clear(metaMatcher.group(2)));
        }
        return metatags;
    }

    /**
     * Collects the cleaned text of all {@code <h1>}..{@code <h6>} elements.
     *
     * @param str the HTML to process
     * @return a map with the keys "H1".."H6", each mapped to the (possibly empty) list of
     *         header texts of that level in document order
     */
    public static Map<String, List<String>> extractHTMLheaders(String str) {
        Map<String, List<String>> headers = new HashMap<>();
        for (int level = 1; level <= 6; level++) {
            headers.put("H" + level, new ArrayList<String>());
        }
        Matcher headerMatcher = HX_PATTERN.matcher(str);
        while (headerMatcher.find()) {
            // The pattern is case-insensitive, so normalise "h2" to the "H2" key.
            String headerKey = headerMatcher.group(1).toUpperCase();
            headers.get(headerKey).add(clear(headerMatcher.group(2)));
        }
        return headers;
    }
}
