package org.apache.any23.encoding;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.tika.detect.TextStatistics;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.PseudoTextElement;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/* loaded from: input_file:org/apache/any23/encoding/TikaEncodingDetector.class */
/**
 * {@link EncodingDetector} that guesses the character encoding of a byte stream.
 *
 * <p>Detection proceeds through the following steps and stops at the first one that
 * yields a charset:
 * <ol>
 *   <li>byte statistics report that the content looks like UTF-8;</li>
 *   <li>the charset declared by the caller (via the content type), after
 *       {@code EncodingUtils.correctVariant};</li>
 *   <li>a charset found by {@code EncodingUtils.xmlCharset} in the ISO-8859-1 decoded text;</li>
 *   <li>a charset found by {@code EncodingUtils.htmlCharset} in the jsoup-parsed document;</li>
 *   <li>UTF-8 when the content contains no eight-bit bytes;</li>
 *   <li>the candidates proposed by Tika's {@link CharsetDetector}, optionally run on
 *       markup-stripped text;</li>
 *   <li>ISO-8859-1 (passed through {@code EncodingUtils.correctVariant}) as the last resort.</li>
 * </ol>
 */
public class TikaEncodingDetector implements EncodingDetector {
    /** Characters that make up markup tags; used to probe candidate charsets. */
    private static final String TAG_CHARS = "< />";

    /** UTF-8 encoding of {@link #TAG_CHARS}. */
    private static final byte[] TAG_BYTES = TAG_CHARS.getBytes(StandardCharsets.UTF_8);

    /** Shared zero-length array used as the type token for {@code List.toArray}. */
    private static final Node[] EMPTY_NODES = new Node[0];

    /**
     * Matches parse-error messages that quote one of the characters {@code <}, {@code /}
     * or {@code >}. Compiled once instead of once per parse error.
     */
    private static final Pattern TAG_ERROR_PATTERN = Pattern.compile(".*'[</>]'.*");

    /**
     * A function of an {@link InputStream} that is allowed to throw {@link IOException}.
     *
     * <p>Declared {@code public} in this (decompiled) source; the decompiler recorded that the
     * access modifier was changed from {@code private}.
     *
     * @param <E> type of the computed result
     */
    @FunctionalInterface
    public interface InputStreamFunction<E> {
        /**
         * Computes a value by reading from the given stream.
         *
         * @param inputStream stream to read from
         * @return the computed value
         * @throws IOException if reading fails
         */
        E compute(InputStream inputStream) throws IOException;
    }

    /**
     * Guesses the encoding of the stream without any declared charset hint.
     *
     * @param inputStream stream whose content is examined
     * @return name of the guessed charset
     * @throws IOException if reading from the stream fails
     */
    public String guessEncoding(InputStream inputStream) throws IOException {
        // The cast selects the (InputStream, String) overload; a bare null would be ambiguous.
        return guessEncoding(inputStream, (String) null);
    }

    /**
     * Core detection routine; see the class documentation for the order of the steps.
     *
     * <p>If the stream does not support mark/reset it is wrapped in a
     * {@link BufferedInputStream}; in that case only the wrapper is reset, so bytes read during
     * detection are consumed from the stream that was passed in.
     *
     * @param inputStream stream whose content is examined
     * @param charset charset declared by the caller, or {@code null} if none
     * @return the guessed charset; this is whatever {@code EncodingUtils.correctVariant} returns
     *         in the final fallback (NOTE(review): confirm that it cannot be {@code null} there,
     *         since the public overload calls {@code name()} on the result)
     * @throws IOException if reading from the stream fails
     */
    private static Charset guessEncoding(InputStream inputStream, Charset charset) throws IOException {
        InputStream in = inputStream.markSupported() ? inputStream : new BufferedInputStream(inputStream);

        TextStatistics stats = computeAndReset(in, EncodingUtils::stats);
        if (stats.looksLikeUTF8()) {
            return StandardCharsets.UTF_8;
        }

        Charset declared = EncodingUtils.correctVariant(stats, charset);
        if (declared != null) {
            return declared;
        }

        // Text produced by EncodingUtils.iso_8859_1; used below for the XML and HTML checks and
        // converted back to bytes with ISO-8859-1 for the statistical detector.
        String text = computeAndReset(in, EncodingUtils::iso_8859_1);

        Charset xmlCharset = EncodingUtils.xmlCharset(stats, text);
        if (xmlCharset != null) {
            return xmlCharset;
        }

        ParseErrorList parseErrors = ParseErrorList.tracking(Integer.MAX_VALUE);
        Document document = parseFragment(text, parseErrors);
        Charset htmlCharset = EncodingUtils.htmlCharset(stats, document);
        if (htmlCharset != null) {
            return htmlCharset;
        }

        if (stats.countEightBit() == 0) {
            return StandardCharsets.UTF_8;
        }

        long tagCount = countTags(document);
        long tagErrorCount = parseErrors.stream()
                .map(error -> error.getErrorMessage())
                .filter(message -> message != null && TAG_ERROR_PATTERN.matcher(message).matches())
                .count();

        // Strip markup only when the input has at least five tags and tag-related parse errors
        // do not exceed a fifth of the tag count (integer division).
        boolean markupStripped = false;
        if (tagCount >= 5 && tagCount / 5 >= tagErrorCount) {
            String stripped = wholeText(document);
            // Keep the stripped text unless it is short (< 100 chars) while the raw input is
            // long (> 600 chars).
            if (stripped.length() >= 100 || text.length() <= 600) {
                text = stripped;
                markupStripped = true;
            }
        }

        byte[] bytes = text.getBytes(StandardCharsets.ISO_8859_1);
        CharsetDetector detector = new CharsetDetector(bytes.length);
        detector.setText(bytes);
        for (CharsetMatch match : detector.detectAll()) {
            try {
                Charset candidate = EncodingUtils.forName(match.getName());
                // After stripping markup, accept only candidates that decode the tag bytes back
                // to the tag characters, i.e. that agree with ASCII on '<', ' ', '/' and '>'.
                if (markupStripped && !TAG_CHARS.equals(new String(TAG_BYTES, candidate))) {
                    continue;
                }
                Charset corrected = EncodingUtils.correctVariant(stats, candidate);
                if (corrected != null) {
                    return corrected;
                }
            } catch (Exception ignored) {
                // Deliberate best effort: a candidate that cannot be resolved or checked
                // (presumably a charset name unknown to this JVM) is skipped and the next
                // match is tried.
            }
        }
        return EncodingUtils.correctVariant(stats, StandardCharsets.ISO_8859_1);
    }

    /**
     * Guesses the encoding of the stream, taking a declared charset hint into account.
     *
     * @param inputStream stream whose content is examined
     * @param str value handed to {@code EncodingUtils.contentTypeCharset} to obtain the declared
     *            charset (presumably a Content-Type string); may be {@code null}
     * @return name of the guessed charset
     * @throws IOException if reading from the stream fails
     */
    public String guessEncoding(InputStream inputStream, String str) throws IOException {
        return guessEncoding(inputStream, EncodingUtils.contentTypeCharset(str)).name();
    }

    /**
     * Marks the stream, applies the function and resets the stream to the mark, even when the
     * function throws.
     *
     * @param inputStream stream that supports mark/reset
     * @param inputStreamFunction computation to run against the stream
     * @param <E> result type
     * @return the value computed by the function
     * @throws IOException if the function or the reset fails
     */
    private static <E> E computeAndReset(InputStream inputStream, InputStreamFunction<E> inputStreamFunction) throws IOException {
        inputStream.mark(Integer.MAX_VALUE);
        try {
            return inputStreamFunction.compute(inputStream);
        } finally {
            inputStream.reset();
        }
    }

    /**
     * Parses the text as a fragment and re-parents the resulting nodes under a fresh, empty
     * {@link Document}.
     *
     * @param str text to parse
     * @param parseErrorList list that receives the parse errors
     * @return a document whose children are the parsed fragment nodes
     */
    private static Document parseFragment(String str, ParseErrorList parseErrorList) {
        Document document = new Document("");
        // Iterate over an array copy: nodes are detached from their parent inside the loop.
        Node[] nodes = Parser.parseFragment(str, null, "", parseErrorList).toArray(EMPTY_NODES);
        for (Node node : nodes) {
            if (node.parentNode() != null) {
                node.remove();
            }
            document.appendChild(node);
        }
        return document;
    }

    /**
     * Counts tags below the given node. Elements, document types and comments count once when
     * they have no child nodes and twice otherwise; documents and pseudo text elements are not
     * counted.
     *
     * @param node root of the tree to traverse
     * @return the tag count
     */
    private static long countTags(Node node) {
        // Single-element array so the anonymous visitor can update the counter.
        final long[] total = {0};
        NodeTraversor.traverse(new NodeVisitor() {
            public void head(Node current, int depth) {
                if ((current instanceof Document) || (current instanceof PseudoTextElement)) {
                    return;
                }
                if ((current instanceof Element) || (current instanceof DocumentType) || (current instanceof Comment)) {
                    total[0] += current.childNodeSize() == 0 ? 1L : 2L;
                }
            }

            public void tail(Node current, int depth) {
            }
        }, node);
        return total[0];
    }

    /**
     * Collects the textual content below the given node:
     * <ul>
     *   <li>the whole text of text nodes;</li>
     *   <li>the data of data nodes located in a {@code script} whose {@code type} attribute
     *       contains "json";</li>
     *   <li>comment data that contains neither {@code <!} nor {@code <?};</li>
     *   <li>the {@code content} attribute of elements.</li>
     * </ul>
     *
     * @param node root of the tree to traverse
     * @return the concatenated text
     */
    private static String wholeText(Node node) {
        final StringBuilder text = new StringBuilder();
        NodeTraversor.traverse(new NodeVisitor() {
            public void head(Node current, int depth) {
                if (current instanceof TextNode) {
                    text.append(((TextNode) current).getWholeText());
                } else if (current instanceof DataNode) {
                    Node script = enclosingScript(current);
                    if (script != null && script.attr("type").toLowerCase(Locale.ROOT).contains("json")) {
                        text.append(((DataNode) current).getWholeData());
                    }
                } else if (current instanceof Comment) {
                    String data = ((Comment) current).getData();
                    if (!data.contains("<!") && !data.contains("<?")) {
                        text.append(data);
                    }
                } else if (current instanceof Element) {
                    text.append(current.attr("content"));
                }
            }

            public void tail(Node current, int depth) {
            }
        }, node);
        return text.toString();
    }

    /**
     * Walks from the node towards the root and returns the first node named {@code script},
     * or {@code null} if a node named {@code style} or the root is reached first.
     */
    private static Node enclosingScript(Node node) {
        Node current = node;
        while (!"script".equalsIgnoreCase(current.nodeName())) {
            if ("style".equalsIgnoreCase(current.nodeName())) {
                return null;
            }
            current = current.parentNode();
            if (current == null) {
                return null;
            }
        }
        return current;
    }
}
