package org.apache.any23.extractor.html;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
import org.apache.any23.validator.DefaultValidator;
import org.apache.any23.validator.ValidatorException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

/* loaded from: input_file:org/apache/any23/extractor/html/TagSoupParser.class */
/**
 * Lazily parses an HTML input stream into a DOM {@link Document} by delegating to a
 * {@link TagSoupParsingConfiguration}, and optionally runs the {@link DefaultValidator}
 * over the resulting document.
 *
 * <p>The parsed document is cached in this instance after the first successful call to
 * {@link #getDOM()}; the input stream is therefore handed to the parsing configuration at
 * most once per successful parse.
 */
public class TagSoupParser {
    /**
     * The literal name {@code "Element-Location"}. It is not referenced inside this class;
     * presumably it is the key under which an {@link ElementLocation} is attached to DOM
     * nodes by the parsing configuration (confirm against its users).
     */
    public static final String ELEMENT_LOCATION = "Element-Location";

    private static final Logger logger = LoggerFactory.getLogger(TagSoupParser.class);

    private final InputStream input;
    private final String documentIRI;
    private final String encoding;
    private final TagSoupParsingConfiguration config;

    /** Cached parse result; {@code null} until {@link #getDOM()} has succeeded once. */
    private Document result;

    /**
     * Immutable holder for the begin and end line/column numbers of an element.
     * Whether the numbers are zero- or one-based is decided by whoever creates the
     * instance and cannot be determined from this class.
     */
    public static class ElementLocation {
        private final int beginLineNumber;
        private final int beginColumnNumber;
        private final int endLineNumber;
        private final int endColumnNumber;

        private ElementLocation(
                int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber) {
            this.beginLineNumber = beginLineNumber;
            this.beginColumnNumber = beginColumnNumber;
            this.endLineNumber = endLineNumber;
            this.endColumnNumber = endColumnNumber;
        }

        /** @return the line number at which the element begins */
        public int getBeginLineNumber() {
            return this.beginLineNumber;
        }

        /** @return the column number at which the element begins */
        public int getBeginColumnNumber() {
            return this.beginColumnNumber;
        }

        /** @return the line number at which the element ends */
        public int getEndLineNumber() {
            return this.endLineNumber;
        }

        /** @return the column number at which the element ends */
        public int getEndColumnNumber() {
            return this.endColumnNumber;
        }
    }

    /**
     * Creates a parser with no explicit encoding ({@code null} is passed on to the
     * parsing configuration) and the default {@link TagSoupParsingConfiguration}.
     *
     * @param input       stream to read the HTML from
     * @param documentIRI IRI of the document; also set as the document URI of the DOM
     */
    public TagSoupParser(InputStream input, String documentIRI) {
        // A null encoding bypasses the charset check in the delegate constructor.
        this(input, documentIRI, null);
    }

    /**
     * Creates a parser with an explicit encoding and the default
     * {@link TagSoupParsingConfiguration}.
     *
     * @param input       stream to read the HTML from
     * @param documentIRI IRI of the document; also set as the document URI of the DOM
     * @param encoding    charset name to pass to the parsing configuration, or {@code null}
     * @throws UnsupportedCharsetException if {@code encoding} is non-null and
     *         {@link Charset#isSupported(String)} reports it as unsupported
     */
    public TagSoupParser(InputStream input, String documentIRI, String encoding) {
        if (encoding != null && !Charset.isSupported(encoding)) {
            throw new UnsupportedCharsetException(
                    String.format(Locale.ROOT, "Charset %s is not supported", encoding));
        }
        this.input = input;
        this.documentIRI = documentIRI;
        this.encoding = encoding;
        this.config = TagSoupParsingConfiguration.getDefault();
    }

    /**
     * Returns the DOM of the input, parsing it on the first call and reusing the cached
     * document afterwards. The document URI is (re)set to the document IRI on every call.
     *
     * @return the parsed document
     * @throws IOException if the parsing configuration fails with an I/O error
     */
    public Document getDOM() throws IOException {
        if (this.result == null) {
            final long startMillis = System.currentTimeMillis();
            try {
                this.result = this.config.parse(this.input, this.documentIRI, this.encoding);
            } finally {
                // Log the elapsed time whether parsing succeeded or threw.
                logger.debug(
                        "Parsed {} with {}, {}ms",
                        this.documentIRI,
                        this.config.name(),
                        System.currentTimeMillis() - startMillis);
            }
        }
        this.result.setDocumentURI(this.documentIRI);
        return this.result;
    }

    /**
     * Parses the input (see {@link #getDOM()}) and validates the resulting document with a
     * new {@link DefaultValidator}.
     *
     * @param applyFix flag forwarded as the third argument of
     *                 {@code DefaultValidator.validate}; presumably enables applying fixes
     *                 to the document (confirm against the validator)
     * @return a report pairing the validation outcome with the document
     * @throws IOException        if parsing fails with an I/O error
     * @throws ValidatorException if the document IRI is not a valid {@link URI}, or if
     *                            validation itself reports this exception
     */
    public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
        // NOTE(review): the try spans parsing and validation too, so an
        // IllegalArgumentException raised there is also reported as an invalid document IRI.
        try {
            final URI documentUri = new URI(this.documentIRI);
            final DefaultValidator validator = new DefaultValidator();
            final Document document = getDOM();
            return new DocumentReport(validator.validate(documentUri, document, applyFix), document);
        } catch (IllegalArgumentException | URISyntaxException e) {
            throw new ValidatorException("Error while performing validation, invalid document IRI.", e);
        }
    }
}
