package dev.langchain4j.data.document.transformer;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentTransformer;
import dev.langchain4j.data.document.Metadata;
import java.util.Map;
import java.util.stream.Collectors;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dev/langchain4j/data/document/transformer/HtmlTextExtractor.class */
/**
 * A {@link DocumentTransformer} that parses a document's text as HTML with jsoup and
 * replaces it with a plain-text rendering.
 *
 * <p>If a CSS selector is configured, only the matching elements are rendered and their
 * texts are joined with a blank line; otherwise the whole parsed document is rendered.
 * Optionally, additional metadata entries are filled from the text of elements matched by
 * per-key CSS selectors, and the absolute URL of every {@code <a>} element can be appended
 * after its text.
 */
public class HtmlTextExtractor implements DocumentTransformer {
    private static final Logger log = LoggerFactory.getLogger(HtmlTextExtractor.class);

    /** Selector for the elements to render, or {@code null} to render the whole document. */
    private final String cssSelector;

    /** Metadata key to CSS selector; may be {@code null} when no metadata is extracted. */
    private final Map<String, String> metadataCssSelectors;

    /** Whether the absolute {@code href} of each {@code <a>} element is appended to the text. */
    private final boolean includeLinks;

    /**
     * jsoup {@link NodeVisitor} that accumulates a plain-text rendering of the visited nodes.
     * Text nodes are appended as-is; selected element names add line breaks, list bullets or
     * indentation around them. The accumulated text is returned by {@link #toString()}.
     */
    public static class TextExtractingVisitor implements NodeVisitor {
        private final StringBuilder textBuilder;
        private final boolean includeLinks;

        private TextExtractingVisitor(boolean includeLinks) {
            this.textBuilder = new StringBuilder();
            this.includeLinks = includeLinks;
        }

        /**
         * Called when a node is first encountered: appends text-node content, or the prefix
         * (bullet, indent, line break) associated with the element name.
         */
        @Override // org.jsoup.select.NodeVisitor
        public void head(Node node, int depth) {
            if (node instanceof TextNode) {
                textBuilder.append(((TextNode) node).text());
                return;
            }
            String nodeName = node.nodeName();
            if (nodeName.equals("li")) {
                textBuilder.append("\n * ");
            } else if (nodeName.equals("dt")) {
                textBuilder.append("  ");
            } else if (StringUtil.in(nodeName, "p", "h1", "h2", "h3", "h4", "h5", "h6", "tr")) {
                // "p" is the HTML paragraph tag; kept as a plain literal so the tag list
                // does not depend on an unrelated library constant.
                textBuilder.append("\n");
            }
        }

        /**
         * Called after a node's children have been visited: appends a trailing line break for
         * selected element names, or the link target for {@code <a>} when links are included.
         */
        @Override // org.jsoup.select.NodeVisitor
        public void tail(Node node, int depth) {
            String nodeName = node.nodeName();
            if (StringUtil.in(nodeName, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5", "h6")) {
                textBuilder.append("\n");
                return;
            }
            if (includeLinks && nodeName.equals("a")) {
                String absUrl = node.absUrl("href");
                if (absUrl.isEmpty() && node.baseUri().isEmpty()) {
                    log.warn("No 'URL' metadata found for document. Link will be empty");
                }
                // The marker is appended even when the URL is empty, yielding " <>".
                textBuilder.append(" <").append(absUrl).append('>');
            }
        }

        /** Returns the text accumulated so far. */
        @Override
        public String toString() {
            return textBuilder.toString();
        }
    }

    /**
     * Creates an extractor that renders the whole document, extracts no extra metadata and
     * does not include links.
     */
    public HtmlTextExtractor() {
        this(null, null, false);
    }

    /**
     * Creates an extractor.
     *
     * @param cssSelector          selector for the elements to render; {@code null} renders
     *                             the whole document
     * @param metadataCssSelectors metadata key to CSS selector; for each entry the text of the
     *                             matched elements is stored under the key; may be {@code null}
     * @param includeLinks         whether to append the absolute URL after each {@code <a>}
     */
    public HtmlTextExtractor(String cssSelector, Map<String, String> metadataCssSelectors, boolean includeLinks) {
        this.cssSelector = cssSelector;
        this.metadataCssSelectors = metadataCssSelectors;
        this.includeLinks = includeLinks;
    }

    /**
     * Parses the document text as HTML and returns a new document holding the extracted plain
     * text and a copy of the original metadata, extended with any configured selector-based
     * entries.
     *
     * @param document the document whose text is HTML; its {@code "url"} metadata value, when
     *                 present, is passed to jsoup as the base URI
     * @return a new document with the extracted text and the (possibly extended) metadata copy
     */
    @Override // dev.langchain4j.data.document.DocumentTransformer
    public Document transform(Document document) {
        String url = document.metadata("url");
        String baseUri = url != null ? url : "";
        org.jsoup.nodes.Document parsed = Jsoup.parse(document.text(), baseUri);

        String text = cssSelector != null
                ? extractText(parsed, cssSelector, includeLinks)
                : extractText(parsed, includeLinks);

        Metadata metadata = document.metadata().copy();
        if (metadataCssSelectors != null) {
            metadataCssSelectors.forEach((key, selector) -> metadata.put(key, parsed.select(selector).text()));
        }
        return Document.from(text, metadata);
    }

    /**
     * Renders every element matched by {@code cssSelector} and joins the results with a
     * blank line.
     */
    private static String extractText(org.jsoup.nodes.Document document, String cssSelector, boolean includeLinks) {
        return document.select(cssSelector).stream()
                .map(element -> extractText(element, includeLinks))
                .collect(Collectors.joining("\n\n"));
    }

    /**
     * Renders a single element (and its descendants) to trimmed plain text.
     *
     * @param element      root of the subtree to render
     * @param includeLinks whether to append the absolute URL after each {@code <a>}
     * @return the rendered text with leading and trailing whitespace removed
     */
    public static String extractText(Element element, boolean includeLinks) {
        TextExtractingVisitor visitor = new TextExtractingVisitor(includeLinks);
        NodeTraversor.traverse(visitor, element);
        return visitor.toString().trim();
    }
}
