/*
 * Decompiled with CFR 0.152.
 */
package dev.langchain4j.data.document.transformer;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentTransformer;
import dev.langchain4j.data.document.Metadata;
import java.util.Map;
import java.util.stream.Collectors;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link DocumentTransformer} that parses the text of a {@link Document} as HTML (using jsoup)
 * and produces a new {@link Document} whose text is a plain-text rendering of that HTML.
 *
 * <p>Optional behaviour, selected at construction time:
 * <ul>
 *   <li>restrict extraction to the elements matching a CSS selector;</li>
 *   <li>store the text of elements matched by further CSS selectors as metadata entries;</li>
 *   <li>append the absolute target of every {@code <a>} element after its text.</li>
 * </ul>
 *
 * <p>The value of the {@code "url"} metadata entry of the input document, when present, is passed
 * to jsoup as the base URI used to resolve relative links.
 */
public class HtmlTextExtractor implements DocumentTransformer {
    private static final Logger log = LoggerFactory.getLogger(HtmlTextExtractor.class);

    /** CSS selector of the elements to extract text from; {@code null} means the whole document. */
    private final String cssSelector;
    /** Metadata key to CSS selector; {@code null} means no metadata is extracted from the HTML. */
    private final Map<String, String> metadataCssSelectors;
    /** Whether link targets are appended after the text of {@code <a>} elements. */
    private final boolean includeLinks;

    /**
     * Creates an extractor that renders the whole document, extracts no metadata and does not
     * include link targets.
     */
    public HtmlTextExtractor() {
        this(null, null, false);
    }

    /**
     * Creates an extractor.
     *
     * @param cssSelector          CSS selector of the elements whose text is extracted; when
     *                             {@code null} the whole document is rendered
     * @param metadataCssSelectors mapping from metadata key to the CSS selector whose matched text
     *                             is stored under that key; may be {@code null}
     * @param includeLinks         whether to append {@code " <url>"} after each {@code <a>} element
     */
    public HtmlTextExtractor(String cssSelector, Map<String, String> metadataCssSelectors, boolean includeLinks) {
        this.cssSelector = cssSelector;
        this.metadataCssSelectors = metadataCssSelectors;
        this.includeLinks = includeLinks;
    }

    /**
     * Parses the text of {@code document} as HTML and returns a new document holding the extracted
     * plain text together with a copy of the original metadata, extended with one entry per
     * configured metadata selector.
     *
     * @param document the document whose text is HTML
     * @return a new document containing the extracted text and the resulting metadata
     */
    @Override
    public Document transform(Document document) {
        String html = document.text();
        String url = document.metadata("url");
        // jsoup resolves relative links against this base URI; an empty string means "no base".
        String baseUrl = url != null ? url : "";
        org.jsoup.nodes.Document jsoupDocument = Jsoup.parse(html, baseUrl);

        String text = cssSelector != null
                ? extractText(jsoupDocument, cssSelector, includeLinks)
                : extractText(jsoupDocument, includeLinks);

        // Work on a copy so the metadata of the input document is not modified.
        Metadata metadata = document.metadata().copy();
        if (metadataCssSelectors != null) {
            metadataCssSelectors.forEach((metadataKey, selector) ->
                    metadata.put(metadataKey, jsoupDocument.select(selector).text()));
        }
        return Document.from(text, metadata);
    }

    /**
     * Renders every element matched by {@code cssSelector} and joins the renderings with a blank
     * line ({@code "\n\n"}).
     */
    private static String extractText(org.jsoup.nodes.Document jsoupDocument, String cssSelector, boolean includeLinks) {
        return jsoupDocument.select(cssSelector).stream()
                .map(element -> extractText(element, includeLinks))
                .collect(Collectors.joining("\n\n"));
    }

    /** Renders a single element (and its descendants) as trimmed plain text. */
    private static String extractText(Element element, boolean includeLinks) {
        TextExtractingVisitor visitor = new TextExtractingVisitor(includeLinks);
        NodeTraversor.traverse(visitor, element);
        return visitor.toString().trim();
    }

    /**
     * Node visitor that accumulates a plain-text rendering of the visited nodes: text nodes are
     * appended as-is, and selected tags contribute line breaks, list bullets or indentation.
     */
    private static class TextExtractingVisitor implements NodeVisitor {
        private final StringBuilder textBuilder = new StringBuilder();
        private final boolean includeLinks;

        private TextExtractingVisitor(boolean includeLinks) {
            this.includeLinks = includeLinks;
        }

        /** Appends the node's text, or the prefix associated with its tag. */
        @Override
        public void head(Node node, int depth) {
            if (node instanceof TextNode) {
                textBuilder.append(((TextNode) node).text());
                return;
            }
            switch (node.nodeName()) {
                case "li":
                    textBuilder.append("\n * ");
                    break;
                case "dt":
                    textBuilder.append("  ");
                    break;
                case "p":
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                case "tr":
                    textBuilder.append("\n");
                    break;
                default:
                    break;
            }
        }

        /** Appends the suffix associated with the node's tag: a line break, or the link target. */
        @Override
        public void tail(Node node, int depth) {
            switch (node.nodeName()) {
                case "br":
                case "dd":
                case "dt":
                case "p":
                case "h1":
                case "h2":
                case "h3":
                case "h4":
                case "h5":
                case "h6":
                    textBuilder.append("\n");
                    break;
                case "a":
                    if (includeLinks) {
                        appendLink(node);
                    }
                    break;
                default:
                    break;
            }
        }

        /** Appends {@code " <absolute-url>"} for the given anchor node. */
        private void appendLink(Node anchor) {
            String link = anchor.absUrl("href");
            // absUrl returns "" both when the attribute is missing and when it cannot be made
            // absolute. Only the second case can be caused by missing URL metadata, so the
            // warning is limited to anchors that actually carry an href attribute.
            if (link.isEmpty() && anchor.hasAttr("href") && anchor.baseUri().isEmpty()) {
                log.warn("No 'URL' metadata found for document. Link will be empty");
            }
            textBuilder.append(" <").append(link).append('>');
        }

        /** Returns the text accumulated so far. */
        @Override
        public String toString() {
            return textBuilder.toString();
        }
    }
}

